diff --git a/.gitattributes b/.gitattributes index 0f95a3eb12c46633af28b36dff19e7b9a2f0fab7..450694353054abb5890e37434207b87c07eeb1fc 100644 --- a/.gitattributes +++ b/.gitattributes @@ -382,3 +382,94 @@ productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_feature productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4160/tokenizer.json filter=lfs diff=lfs merge=lfs -text productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-832/tokenizer.json filter=lfs diff=lfs merge=lfs -text productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1266/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1688/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2110/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2532/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2954/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3376/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3798/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-422/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-4220/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-844/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1266/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1688/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2110/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2532/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2954/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3376/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3798/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-422/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4220/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-844/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1248/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1664/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2080/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2496/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2912/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3328/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3744/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-416/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4160/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1248/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-832/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1664/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2080/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2496/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2912/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3328/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3744/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-416/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4160/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-832/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1266/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1688/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2110/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2532/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2954/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3376/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3798/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-422/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4220/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-844/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1266/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1688/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2110/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2532/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2954/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3376/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3798/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-422/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4220/tokenizer.json filter=lfs diff=lfs merge=lfs -text +productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-844/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1040/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1020/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1060/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1080/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1120/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1140/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1160/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1180/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-120/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1220/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1240/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1260/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1280/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1320/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1340/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1360/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1380/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-140/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1420/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1440/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1460/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1480/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1520/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1540/tokenizer.json filter=lfs diff=lfs merge=lfs -text +overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1560/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3465/tokenizer_config.json b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3465/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3465/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3465/trainer_state.json b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3465/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..517d7bc37df454fdc9335a848e9d75c44555034f --- /dev/null +++ b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3465/trainer_state.json @@ -0,0 +1,823 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.0, + "eval_steps": 500, + "global_step": 3465, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.2479030179977415, + "epoch": 0.1299545159194282, + "grad_norm": 1.519571304321289, + "learning_rate": 3.522207847653314e-05, + "loss": 2.093206329345703, + "mean_token_accuracy": 0.6068353663384914, + "num_tokens": 154518.0, + "step": 50 + }, + { + "entropy": 0.932415626347065, + "epoch": 0.2599090318388564, + "grad_norm": 1.180830955505371, + "learning_rate": 7.11629748811588e-05, + "loss": 0.8930854797363281, + "mean_token_accuracy": 0.7708445385098457, + "num_tokens": 306733.0, + "step": 100 + }, + { + "entropy": 0.7730373838543891, + "epoch": 0.3898635477582846, + "grad_norm": 0.7839977145195007, + "learning_rate": 0.00010710387128578447, + "loss": 0.7302116394042969, + "mean_token_accuracy": 0.8012136635184288, + "num_tokens": 446267.0, + "step": 150 + }, + { + "entropy": 0.6934178560972214, + "epoch": 0.5198180636777128, + "grad_norm": 0.666778564453125, + "learning_rate": 0.0001430447676904101, + "loss": 0.6505754852294922, + "mean_token_accuracy": 0.8195212116837501, + "num_tokens": 600256.0, + "step": 200 + }, + { + "entropy": 0.6900296103954315, + "epoch": 0.649772579597141, + "grad_norm": 0.6762415766716003, + "learning_rate": 0.00017898566409503577, + "loss": 0.6378536987304687, + "mean_token_accuracy": 0.8223087686300278, + "num_tokens": 738649.0, + "step": 250 + }, + { + "entropy": 0.667421719878912, + "epoch": 0.7797270955165692, + "grad_norm": 0.5047685503959656, + "learning_rate": 0.00021492656049966144, + "loss": 0.6148524856567383, + "mean_token_accuracy": 0.8280292323231697, + "num_tokens": 883494.0, + "step": 300 + }, + { + "entropy": 0.6388977643847465, + "epoch": 0.9096816114359974, + "grad_norm": 0.4360353350639343, + "learning_rate": 0.0002508674569042871, + "loss": 0.5933729553222656, + "mean_token_accuracy": 0.8329134130477905, + "num_tokens": 1032111.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.6612381511009656, + "eval_loss": 0.6559221744537354, + "eval_mean_token_accuracy": 0.8195324820967821, + "eval_num_tokens": 1132140.0, + "eval_runtime": 53.4007, + "eval_samples_per_second": 31.03, + "eval_steps_per_second": 3.895, + "step": 385 + }, + { + "entropy": 0.6224366770916848, + "epoch": 1.0389863547758285, + "grad_norm": 0.5294668078422546, + "learning_rate": 0.00027673375518355765, + "loss": 0.5677951431274414, + "mean_token_accuracy": 0.8380465067211708, + "num_tokens": 1177556.0, + "step": 400 + }, + { + "entropy": 0.5827466724812984, + "epoch": 1.1689408706952567, + "grad_norm": 0.5172416567802429, + "learning_rate": 0.0002765120122346144, + "loss": 0.5423126983642578, + "mean_token_accuracy": 0.8467991036176682, + "num_tokens": 1325434.0, + "step": 450 + }, + { + "entropy": 0.5831517253816128, + "epoch": 1.2988953866146848, + "grad_norm": 0.41916292905807495, + "learning_rate": 0.0002760064270819138, + "loss": 0.534448013305664, + "mean_token_accuracy": 0.8456632816791534, + "num_tokens": 1474116.0, + "step": 500 + }, + { + "entropy": 0.5869986982643605, + "epoch": 1.428849902534113, + "grad_norm": 0.4387759566307068, + "learning_rate": 0.00027521803857633113, + "loss": 0.5367491912841796, + "mean_token_accuracy": 0.8462416216731071, + "num_tokens": 1621193.0, + "step": 550 + }, + { + "entropy": 0.5771756853163242, + "epoch": 1.5588044184535412, + "grad_norm": 0.49079665541648865, + "learning_rate": 0.00027414846665880935, + "loss": 0.5238623809814453, + "mean_token_accuracy": 0.84760089635849, + "num_tokens": 1767789.0, + "step": 600 + }, + { + "entropy": 0.5549105909466744, + "epoch": 1.6887589343729694, + "grad_norm": 0.4000363051891327, + "learning_rate": 0.0002727999090317863, + "loss": 0.510434226989746, + "mean_token_accuracy": 0.8517858856916427, + "num_tokens": 1918138.0, + "step": 650 + }, + { + "entropy": 0.583413660377264, + "epoch": 1.8187134502923976, + "grad_norm": 0.33592426776885986, + "learning_rate": 0.00027117513664346674, + "loss": 0.5297993850708008, + "mean_token_accuracy": 0.846615691781044, + "num_tokens": 2057575.0, + "step": 700 + }, + { + "entropy": 0.5732646904885769, + "epoch": 1.9486679662118258, + "grad_norm": 0.5528839230537415, + "learning_rate": 0.00026927748799421714, + "loss": 0.5219194793701172, + "mean_token_accuracy": 0.8489033079147339, + "num_tokens": 2208320.0, + "step": 750 + }, + { + "epoch": 2.0, + "eval_entropy": 0.6027900550801021, + "eval_loss": 0.5946928858757019, + "eval_mean_token_accuracy": 0.8318195798649237, + "eval_num_tokens": 2264280.0, + "eval_runtime": 53.3837, + "eval_samples_per_second": 31.039, + "eval_steps_per_second": 3.896, + "step": 770 + }, + { + "entropy": 0.5329899657611272, + "epoch": 2.077972709551657, + "grad_norm": 0.45793575048446655, + "learning_rate": 0.0002671108622767842, + "loss": 0.48420516967773436, + "mean_token_accuracy": 0.8578200301333289, + "num_tokens": 2348248.0, + "step": 800 + }, + { + "entropy": 0.5142687204480171, + "epoch": 2.207927225471085, + "grad_norm": 0.4690960645675659, + "learning_rate": 0.0002646797113644295, + "loss": 0.4593114471435547, + "mean_token_accuracy": 0.8622670090198516, + "num_tokens": 2501427.0, + "step": 850 + }, + { + "entropy": 0.5135884112119675, + "epoch": 2.3378817413905133, + "grad_norm": 0.3752821683883667, + "learning_rate": 0.00026198903066344565, + "loss": 0.4626216125488281, + "mean_token_accuracy": 0.8612511262297631, + "num_tokens": 2650794.0, + "step": 900 + }, + { + "entropy": 0.5137367483973503, + "epoch": 2.4678362573099415, + "grad_norm": 0.3726271390914917, + "learning_rate": 0.0002590443488488465, + "loss": 0.4601683807373047, + "mean_token_accuracy": 0.8620512077212333, + "num_tokens": 2798180.0, + "step": 950 + }, + { + "entropy": 0.5105714881420136, + "epoch": 2.5977907732293697, + "grad_norm": 0.41296717524528503, + "learning_rate": 0.00025585171650432525, + "loss": 0.46279102325439453, + "mean_token_accuracy": 0.8611763519048691, + "num_tokens": 2950301.0, + "step": 1000 + }, + { + "entropy": 0.5169161760807037, + "epoch": 2.727745289148798, + "grad_norm": 0.4614253044128418, + "learning_rate": 0.0002524176936898197, + "loss": 0.45492774963378907, + "mean_token_accuracy": 0.8627680170536042, + "num_tokens": 3091810.0, + "step": 1050 + }, + { + "entropy": 0.4989277676492929, + "epoch": 2.857699805068226, + "grad_norm": 0.37512704730033875, + "learning_rate": 0.00024874933646223225, + "loss": 0.4531984329223633, + "mean_token_accuracy": 0.8637665447592735, + "num_tokens": 3242184.0, + "step": 1100 + }, + { + "entropy": 0.5177617704868317, + "epoch": 2.9876543209876543, + "grad_norm": 0.3700532019138336, + "learning_rate": 0.00024485418237699976, + "loss": 0.45844474792480466, + "mean_token_accuracy": 0.8626988258957863, + "num_tokens": 3382605.0, + "step": 1150 + }, + { + "epoch": 3.0, + "eval_entropy": 0.5253989950108987, + "eval_loss": 0.5857328176498413, + "eval_mean_token_accuracy": 0.8360884573597175, + "eval_num_tokens": 3396420.0, + "eval_runtime": 53.341, + "eval_samples_per_second": 31.064, + "eval_steps_per_second": 3.899, + "step": 1155 + }, + { + "entropy": 0.4535361140517134, + "epoch": 3.116959064327485, + "grad_norm": 0.3412795662879944, + "learning_rate": 0.00024074023500030492, + "loss": 0.3942829132080078, + "mean_token_accuracy": 0.8781378038564519, + "num_tokens": 3522582.0, + "step": 1200 + }, + { + "entropy": 0.44747897461056707, + "epoch": 3.246913580246914, + "grad_norm": 0.46647050976753235, + "learning_rate": 0.0002364159474637521, + "loss": 0.38986759185791015, + "mean_token_accuracy": 0.8777281475067139, + "num_tokens": 3670864.0, + "step": 1250 + }, + { + "entropy": 0.4480265176296234, + "epoch": 3.3768680961663415, + "grad_norm": 0.4068582355976105, + "learning_rate": 0.00023189020509529866, + "loss": 0.39444759368896487, + "mean_token_accuracy": 0.8774515727162361, + "num_tokens": 3822021.0, + "step": 1300 + }, + { + "entropy": 0.45180007234215736, + "epoch": 3.50682261208577, + "grad_norm": 0.4249928593635559, + "learning_rate": 0.00022717230716213122, + "loss": 0.3977077102661133, + "mean_token_accuracy": 0.8762744688987731, + "num_tokens": 3968736.0, + "step": 1350 + }, + { + "entropy": 0.4614932192862034, + "epoch": 3.636777128005198, + "grad_norm": 0.561008095741272, + "learning_rate": 0.00022227194776300045, + "loss": 0.4022808456420898, + "mean_token_accuracy": 0.8760285252332687, + "num_tokens": 4113509.0, + "step": 1400 + }, + { + "entropy": 0.4414680179953575, + "epoch": 3.7667316439246266, + "grad_norm": 0.38943538069725037, + "learning_rate": 0.00021719919590927584, + "loss": 0.38586376190185545, + "mean_token_accuracy": 0.8783121705055237, + "num_tokens": 4267958.0, + "step": 1450 + }, + { + "entropy": 0.45685607343912127, + "epoch": 3.8966861598440543, + "grad_norm": 0.5362406969070435, + "learning_rate": 0.00021196447483564875, + "loss": 0.3983576583862305, + "mean_token_accuracy": 0.8764419692754746, + "num_tokens": 4415398.0, + "step": 1500 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49981127894268584, + "eval_loss": 0.5997208952903748, + "eval_mean_token_accuracy": 0.8368159819107789, + "eval_num_tokens": 4528560.0, + "eval_runtime": 53.4304, + "eval_samples_per_second": 31.012, + "eval_steps_per_second": 3.893, + "step": 1540 + }, + { + "entropy": 0.4439909208060509, + "epoch": 4.025990903183885, + "grad_norm": 0.5490113496780396, + "learning_rate": 0.00020657854058299564, + "loss": 0.38307292938232423, + "mean_token_accuracy": 0.8795150506436525, + "num_tokens": 4559534.0, + "step": 1550 + }, + { + "entropy": 0.3837250977009535, + "epoch": 4.155945419103314, + "grad_norm": 0.5567234754562378, + "learning_rate": 0.0002010524598974076, + "loss": 0.3182963752746582, + "mean_token_accuracy": 0.8964017608761787, + "num_tokens": 4707075.0, + "step": 1600 + }, + { + "entropy": 0.377872094810009, + "epoch": 4.2858999350227425, + "grad_norm": 0.4315710961818695, + "learning_rate": 0.00019539758749079845, + "loss": 0.318333683013916, + "mean_token_accuracy": 0.8963816618919372, + "num_tokens": 4851683.0, + "step": 1650 + }, + { + "entropy": 0.38739304527640345, + "epoch": 4.41585445094217, + "grad_norm": 0.49140632152557373, + "learning_rate": 0.00018962554270981555, + "loss": 0.32688804626464846, + "mean_token_accuracy": 0.8937860554456711, + "num_tokens": 4994086.0, + "step": 1700 + }, + { + "entropy": 0.39157475270330905, + "epoch": 4.545808966861598, + "grad_norm": 0.40667369961738586, + "learning_rate": 0.00018374818566099208, + "loss": 0.3305763626098633, + "mean_token_accuracy": 0.8916732975840569, + "num_tokens": 5137171.0, + "step": 1750 + }, + { + "entropy": 0.3838599680364132, + "epoch": 4.675763482781027, + "grad_norm": 0.4632417857646942, + "learning_rate": 0.0001777775928411983, + "loss": 0.3267818450927734, + "mean_token_accuracy": 0.8946500706672669, + "num_tokens": 5287076.0, + "step": 1800 + }, + { + "entropy": 0.38270787581801413, + "epoch": 4.805717998700455, + "grad_norm": 0.5529720187187195, + "learning_rate": 0.0001717260323234649, + "loss": 0.3264235305786133, + "mean_token_accuracy": 0.8948800846934318, + "num_tokens": 5436923.0, + "step": 1850 + }, + { + "entropy": 0.38736109718680384, + "epoch": 4.935672514619883, + "grad_norm": 0.5604785680770874, + "learning_rate": 0.00016560593854916497, + "loss": 0.3280513381958008, + "mean_token_accuracy": 0.8931388029456139, + "num_tokens": 5589195.0, + "step": 1900 + }, + { + "epoch": 5.0, + "eval_entropy": 0.4370518812479881, + "eval_loss": 0.6210553050041199, + "eval_mean_token_accuracy": 0.8379779781859654, + "eval_num_tokens": 5660700.0, + "eval_runtime": 53.4039, + "eval_samples_per_second": 31.028, + "eval_steps_per_second": 3.895, + "step": 1925 + }, + { + "entropy": 0.3330705868988181, + "epoch": 5.064977257959714, + "grad_norm": 0.5386723875999451, + "learning_rate": 0.0001594298867783512, + "loss": 0.2754818344116211, + "mean_token_accuracy": 0.9101201346771202, + "num_tokens": 5739445.0, + "step": 1950 + }, + { + "entropy": 0.2980620255321264, + "epoch": 5.1949317738791425, + "grad_norm": 0.5633581876754761, + "learning_rate": 0.00015321056725074549, + "loss": 0.23754241943359375, + "mean_token_accuracy": 0.9203532826900482, + "num_tokens": 5888043.0, + "step": 2000 + }, + { + "entropy": 0.3111592583358288, + "epoch": 5.32488628979857, + "grad_norm": 0.5031015872955322, + "learning_rate": 0.0001469607591104745, + "loss": 0.24428102493286133, + "mean_token_accuracy": 0.917181601524353, + "num_tokens": 6031284.0, + "step": 2050 + }, + { + "entropy": 0.31522042460739613, + "epoch": 5.454840805717999, + "grad_norm": 0.6432453393936157, + "learning_rate": 0.0001406933041481286, + "loss": 0.25112478256225584, + "mean_token_accuracy": 0.9152472382783889, + "num_tokens": 6179927.0, + "step": 2100 + }, + { + "entropy": 0.3046229027956724, + "epoch": 5.584795321637427, + "grad_norm": 0.5104537606239319, + "learning_rate": 0.00013442108041409814, + "loss": 0.2431495475769043, + "mean_token_accuracy": 0.917829519212246, + "num_tokens": 6322630.0, + "step": 2150 + }, + { + "entropy": 0.30440517760813235, + "epoch": 5.714749837556855, + "grad_norm": 0.5307765603065491, + "learning_rate": 0.0001281569757574053, + "loss": 0.24610313415527343, + "mean_token_accuracy": 0.9166415151953697, + "num_tokens": 6469843.0, + "step": 2200 + }, + { + "entropy": 0.304359400421381, + "epoch": 5.844704353476283, + "grad_norm": 0.5014523267745972, + "learning_rate": 0.00012191386134440133, + "loss": 0.24548973083496095, + "mean_token_accuracy": 0.9165477308630944, + "num_tokens": 6617768.0, + "step": 2250 + }, + { + "entropy": 0.3141418205201626, + "epoch": 5.974658869395712, + "grad_norm": 0.567398726940155, + "learning_rate": 0.00011570456521174339, + "loss": 0.24975168228149414, + "mean_token_accuracy": 0.9139353120326996, + "num_tokens": 6761187.0, + "step": 2300 + }, + { + "epoch": 6.0, + "eval_entropy": 0.3758909202252443, + "eval_loss": 0.6843022108078003, + "eval_mean_token_accuracy": 0.8348075449466705, + "eval_num_tokens": 6792840.0, + "eval_runtime": 53.3825, + "eval_samples_per_second": 31.04, + "eval_steps_per_second": 3.896, + "step": 2310 + }, + { + "entropy": 0.24346149432000203, + "epoch": 6.1039636127355426, + "grad_norm": 0.7140825986862183, + "learning_rate": 0.00010954184590799172, + "loss": 0.17231273651123047, + "mean_token_accuracy": 0.9407721275660261, + "num_tokens": 6909578.0, + "step": 2350 + }, + { + "entropy": 0.2160973483324051, + "epoch": 6.23391812865497, + "grad_norm": 0.49014952778816223, + "learning_rate": 0.00010343836627798716, + "loss": 0.15455107688903807, + "mean_token_accuracy": 0.9467655989527702, + "num_tokens": 7056244.0, + "step": 2400 + }, + { + "entropy": 0.21491924367845058, + "epoch": 6.363872644574399, + "grad_norm": 0.5529471635818481, + "learning_rate": 9.740666744387656e-05, + "loss": 0.1584029197692871, + "mean_token_accuracy": 0.9460993978381157, + "num_tokens": 7206950.0, + "step": 2450 + }, + { + "entropy": 0.22037068914622068, + "epoch": 6.493827160493828, + "grad_norm": 0.6232843995094299, + "learning_rate": 9.145914303624717e-05, + "loss": 0.15544342041015624, + "mean_token_accuracy": 0.9450622496008872, + "num_tokens": 7359429.0, + "step": 2500 + }, + { + "entropy": 0.2320463878661394, + "epoch": 6.623781676413255, + "grad_norm": 0.7459681630134583, + "learning_rate": 8.560801372831975e-05, + "loss": 0.16350215911865235, + "mean_token_accuracy": 0.9416968420147895, + "num_tokens": 7499281.0, + "step": 2550 + }, + { + "entropy": 0.22943626195192338, + "epoch": 6.753736192332683, + "grad_norm": 0.7482302784919739, + "learning_rate": 7.986530212552506e-05, + "loss": 0.16422538757324218, + "mean_token_accuracy": 0.9434959614276885, + "num_tokens": 7640758.0, + "step": 2600 + }, + { + "entropy": 0.21795938543975354, + "epoch": 6.883690708252112, + "grad_norm": 0.5210486054420471, + "learning_rate": 7.424280806206118e-05, + "loss": 0.15540474891662598, + "mean_token_accuracy": 0.9459306105971337, + "num_tokens": 7791986.0, + "step": 2650 + }, + { + "epoch": 7.0, + "eval_entropy": 0.3157534837149657, + "eval_loss": 0.7744874954223633, + "eval_mean_token_accuracy": 0.8346395036922052, + "eval_num_tokens": 7924980.0, + "eval_runtime": 53.3771, + "eval_samples_per_second": 31.043, + "eval_steps_per_second": 3.897, + "step": 2695 + }, + { + "entropy": 0.2124774060656677, + "epoch": 7.012995451591943, + "grad_norm": 0.42238789796829224, + "learning_rate": 6.875208435518865e-05, + "loss": 0.14792531967163086, + "mean_token_accuracy": 0.9490461115861059, + "num_tokens": 7940521.0, + "step": 2700 + }, + { + "entropy": 0.15692965138703585, + "epoch": 7.142949967511371, + "grad_norm": 0.4711572229862213, + "learning_rate": 6.340441306708468e-05, + "loss": 0.09051708221435546, + "mean_token_accuracy": 0.9700166273117066, + "num_tokens": 8084193.0, + "step": 2750 + }, + { + "entropy": 0.15241683423519134, + "epoch": 7.272904483430799, + "grad_norm": 0.4312196671962738, + "learning_rate": 5.821078232303016e-05, + "loss": 0.08812363624572754, + "mean_token_accuracy": 0.9699361199140548, + "num_tokens": 8230159.0, + "step": 2800 + }, + { + "entropy": 0.1459079357981682, + "epoch": 7.402858999350228, + "grad_norm": 0.4804084002971649, + "learning_rate": 5.3181863733564636e-05, + "loss": 0.08675944328308105, + "mean_token_accuracy": 0.9703072866797448, + "num_tokens": 8380556.0, + "step": 2850 + }, + { + "entropy": 0.15621621005237102, + "epoch": 7.532813515269655, + "grad_norm": 0.5435478091239929, + "learning_rate": 4.83279904669986e-05, + "loss": 0.09016354560852051, + "mean_token_accuracy": 0.9674961140751839, + "num_tokens": 8523248.0, + "step": 2900 + }, + { + "entropy": 0.1545175113901496, + "epoch": 7.662768031189084, + "grad_norm": 0.524286687374115, + "learning_rate": 4.365913601734056e-05, + "loss": 0.09049141883850098, + "mean_token_accuracy": 0.9679373624920845, + "num_tokens": 8672002.0, + "step": 2950 + }, + { + "entropy": 0.1553485019877553, + "epoch": 7.792722547108512, + "grad_norm": 0.5006484389305115, + "learning_rate": 3.9184893711264495e-05, + "loss": 0.08913107872009278, + "mean_token_accuracy": 0.9684559822082519, + "num_tokens": 8816090.0, + "step": 3000 + }, + { + "entropy": 0.15444331549108029, + "epoch": 7.92267706302794, + "grad_norm": 0.5613893866539001, + "learning_rate": 3.491445699622611e-05, + "loss": 0.08711207389831543, + "mean_token_accuracy": 0.9684525722265244, + "num_tokens": 8966004.0, + "step": 3050 + }, + { + "epoch": 8.0, + "eval_entropy": 0.26580205430778175, + "eval_loss": 0.8996392488479614, + "eval_mean_token_accuracy": 0.8319417791297803, + "eval_num_tokens": 9057120.0, + "eval_runtime": 53.4042, + "eval_samples_per_second": 31.028, + "eval_steps_per_second": 3.895, + "step": 3080 + }, + { + "entropy": 0.14060429509860187, + "epoch": 8.05198180636777, + "grad_norm": 0.25378674268722534, + "learning_rate": 3.085660055023035e-05, + "loss": 0.07468742847442628, + "mean_token_accuracy": 0.9737296260181983, + "num_tokens": 9115929.0, + "step": 3100 + }, + { + "entropy": 0.12152639016509056, + "epoch": 8.1819363222872, + "grad_norm": 0.3634810447692871, + "learning_rate": 2.7019662252065798e-05, + "loss": 0.05918361663818359, + "mean_token_accuracy": 0.9785374769568443, + "num_tokens": 9260646.0, + "step": 3150 + }, + { + "entropy": 0.1228457903675735, + "epoch": 8.311890838206628, + "grad_norm": 0.32873299717903137, + "learning_rate": 2.3411526049051643e-05, + "loss": 0.060924801826477054, + "mean_token_accuracy": 0.9778030979633331, + "num_tokens": 9404737.0, + "step": 3200 + }, + { + "entropy": 0.11790491977706552, + "epoch": 8.441845354126055, + "grad_norm": 0.27871423959732056, + "learning_rate": 2.0039605757500512e-05, + "loss": 0.05871880531311035, + "mean_token_accuracy": 0.9786837643384934, + "num_tokens": 9552241.0, + "step": 3250 + }, + { + "entropy": 0.12911307733505964, + "epoch": 8.571799870045485, + "grad_norm": 0.4569152593612671, + "learning_rate": 1.691082982918235e-05, + "loss": 0.06450970649719238, + "mean_token_accuracy": 0.9761316785216332, + "num_tokens": 9689407.0, + "step": 3300 + }, + { + "entropy": 0.11636774389073253, + "epoch": 8.701754385964913, + "grad_norm": 0.27477338910102844, + "learning_rate": 1.403162711509129e-05, + "loss": 0.05784036159515381, + "mean_token_accuracy": 0.9791285961866378, + "num_tokens": 9842204.0, + "step": 3350 + }, + { + "entropy": 0.11710284009575844, + "epoch": 8.83170890188434, + "grad_norm": 0.26900890469551086, + "learning_rate": 1.1407913655766755e-05, + "loss": 0.05737146377563476, + "mean_token_accuracy": 0.9788037702441216, + "num_tokens": 9994524.0, + "step": 3400 + }, + { + "entropy": 0.11474769543856382, + "epoch": 8.961663417803768, + "grad_norm": 0.3137633204460144, + "learning_rate": 9.045080525311815e-06, + "loss": 0.057830405235290525, + "mean_token_accuracy": 0.9789029136300087, + "num_tokens": 10147597.0, + "step": 3450 + }, + { + "epoch": 9.0, + "eval_entropy": 0.23872991701444754, + "eval_loss": 1.007972002029419, + "eval_mean_token_accuracy": 0.8318104110658169, + "eval_num_tokens": 10189260.0, + "eval_runtime": 53.4047, + "eval_samples_per_second": 31.027, + "eval_steps_per_second": 3.895, + "step": 3465 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.43404829763072e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/README.md b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/adapter_config.json b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b7b923a301af4113e0aa591d097678b1fa73025c --- /dev/null +++ b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.009078376988692594, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "o_proj", + "k_proj", + "q_proj", + "gate_proj", + "up_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/chat_template.jinja b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/tokenizer_config.json b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/trainer_state.json b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d7047e60f094ad7d822bdcb7285421f0dbe913f2 --- /dev/null +++ b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/trainer_state.json @@ -0,0 +1,115 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 385, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.2479030179977415, + "epoch": 0.1299545159194282, + "grad_norm": 1.519571304321289, + "learning_rate": 3.522207847653314e-05, + "loss": 2.093206329345703, + "mean_token_accuracy": 0.6068353663384914, + "num_tokens": 154518.0, + "step": 50 + }, + { + "entropy": 0.932415626347065, + "epoch": 0.2599090318388564, + "grad_norm": 1.180830955505371, + "learning_rate": 7.11629748811588e-05, + "loss": 0.8930854797363281, + "mean_token_accuracy": 0.7708445385098457, + "num_tokens": 306733.0, + "step": 100 + }, + { + "entropy": 0.7730373838543891, + "epoch": 0.3898635477582846, + "grad_norm": 0.7839977145195007, + "learning_rate": 0.00010710387128578447, + "loss": 0.7302116394042969, + "mean_token_accuracy": 0.8012136635184288, + "num_tokens": 446267.0, + "step": 150 + }, + { + "entropy": 0.6934178560972214, + "epoch": 0.5198180636777128, + "grad_norm": 0.666778564453125, + "learning_rate": 0.0001430447676904101, + "loss": 0.6505754852294922, + "mean_token_accuracy": 0.8195212116837501, + "num_tokens": 600256.0, + "step": 200 + }, + { + "entropy": 0.6900296103954315, + "epoch": 0.649772579597141, + "grad_norm": 0.6762415766716003, + "learning_rate": 0.00017898566409503577, + "loss": 0.6378536987304687, + "mean_token_accuracy": 0.8223087686300278, + "num_tokens": 738649.0, + "step": 250 + }, + { + "entropy": 0.667421719878912, + "epoch": 0.7797270955165692, + "grad_norm": 0.5047685503959656, + "learning_rate": 0.00021492656049966144, + "loss": 0.6148524856567383, + "mean_token_accuracy": 0.8280292323231697, + "num_tokens": 883494.0, + "step": 300 + }, + { + "entropy": 0.6388977643847465, + "epoch": 0.9096816114359974, + "grad_norm": 0.4360353350639343, + "learning_rate": 0.0002508674569042871, + "loss": 0.5933729553222656, + "mean_token_accuracy": 0.8329134130477905, + "num_tokens": 1032111.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.6612381511009656, + "eval_loss": 0.6559221744537354, + "eval_mean_token_accuracy": 0.8195324820967821, + "eval_num_tokens": 1132140.0, + "eval_runtime": 53.4007, + "eval_samples_per_second": 31.03, + "eval_steps_per_second": 3.895, + "step": 385 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.91184805036032e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/README.md b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/adapter_config.json b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b7b923a301af4113e0aa591d097678b1fa73025c --- /dev/null +++ b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.009078376988692594, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "o_proj", + "k_proj", + "q_proj", + "gate_proj", + "up_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/chat_template.jinja b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/tokenizer_config.json b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/trainer_state.json b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7bcc12b96d5ae1ddf5b12c429240bd6898758939 --- /dev/null +++ b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/trainer_state.json @@ -0,0 +1,914 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "eval_steps": 500, + "global_step": 3850, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.2479030179977415, + "epoch": 0.1299545159194282, + "grad_norm": 1.519571304321289, + "learning_rate": 3.522207847653314e-05, + "loss": 2.093206329345703, + "mean_token_accuracy": 0.6068353663384914, + "num_tokens": 154518.0, + "step": 50 + }, + { + "entropy": 0.932415626347065, + "epoch": 0.2599090318388564, + "grad_norm": 1.180830955505371, + "learning_rate": 7.11629748811588e-05, + "loss": 0.8930854797363281, + "mean_token_accuracy": 0.7708445385098457, + "num_tokens": 306733.0, + "step": 100 + }, + { + "entropy": 0.7730373838543891, + "epoch": 0.3898635477582846, + "grad_norm": 0.7839977145195007, + "learning_rate": 0.00010710387128578447, + "loss": 0.7302116394042969, + "mean_token_accuracy": 0.8012136635184288, + "num_tokens": 446267.0, + "step": 150 + }, + { + "entropy": 0.6934178560972214, + "epoch": 0.5198180636777128, + "grad_norm": 0.666778564453125, + "learning_rate": 0.0001430447676904101, + "loss": 0.6505754852294922, + "mean_token_accuracy": 0.8195212116837501, + "num_tokens": 600256.0, + "step": 200 + }, + { + "entropy": 0.6900296103954315, + "epoch": 0.649772579597141, + "grad_norm": 0.6762415766716003, + "learning_rate": 0.00017898566409503577, + "loss": 0.6378536987304687, + "mean_token_accuracy": 0.8223087686300278, + "num_tokens": 738649.0, + "step": 250 + }, + { + "entropy": 0.667421719878912, + "epoch": 0.7797270955165692, + "grad_norm": 0.5047685503959656, + "learning_rate": 0.00021492656049966144, + "loss": 0.6148524856567383, + "mean_token_accuracy": 0.8280292323231697, + "num_tokens": 883494.0, + "step": 300 + }, + { + "entropy": 0.6388977643847465, + "epoch": 0.9096816114359974, + "grad_norm": 0.4360353350639343, + "learning_rate": 0.0002508674569042871, + "loss": 0.5933729553222656, + "mean_token_accuracy": 0.8329134130477905, + "num_tokens": 1032111.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.6612381511009656, + "eval_loss": 0.6559221744537354, + "eval_mean_token_accuracy": 0.8195324820967821, + "eval_num_tokens": 1132140.0, + "eval_runtime": 53.4007, + "eval_samples_per_second": 31.03, + "eval_steps_per_second": 3.895, + "step": 385 + }, + { + "entropy": 0.6224366770916848, + "epoch": 1.0389863547758285, + "grad_norm": 0.5294668078422546, + "learning_rate": 0.00027673375518355765, + "loss": 0.5677951431274414, + "mean_token_accuracy": 0.8380465067211708, + "num_tokens": 1177556.0, + "step": 400 + }, + { + "entropy": 0.5827466724812984, + "epoch": 1.1689408706952567, + "grad_norm": 0.5172416567802429, + "learning_rate": 0.0002765120122346144, + "loss": 0.5423126983642578, + "mean_token_accuracy": 0.8467991036176682, + "num_tokens": 1325434.0, + "step": 450 + }, + { + "entropy": 0.5831517253816128, + "epoch": 1.2988953866146848, + "grad_norm": 0.41916292905807495, + "learning_rate": 0.0002760064270819138, + "loss": 0.534448013305664, + "mean_token_accuracy": 0.8456632816791534, + "num_tokens": 1474116.0, + "step": 500 + }, + { + "entropy": 0.5869986982643605, + "epoch": 1.428849902534113, + "grad_norm": 0.4387759566307068, + "learning_rate": 0.00027521803857633113, + "loss": 0.5367491912841796, + "mean_token_accuracy": 0.8462416216731071, + "num_tokens": 1621193.0, + "step": 550 + }, + { + "entropy": 0.5771756853163242, + "epoch": 1.5588044184535412, + "grad_norm": 0.49079665541648865, + "learning_rate": 0.00027414846665880935, + "loss": 0.5238623809814453, + "mean_token_accuracy": 0.84760089635849, + "num_tokens": 1767789.0, + "step": 600 + }, + { + "entropy": 0.5549105909466744, + "epoch": 1.6887589343729694, + "grad_norm": 0.4000363051891327, + "learning_rate": 0.0002727999090317863, + "loss": 0.510434226989746, + "mean_token_accuracy": 0.8517858856916427, + "num_tokens": 1918138.0, + "step": 650 + }, + { + "entropy": 0.583413660377264, + "epoch": 1.8187134502923976, + "grad_norm": 0.33592426776885986, + "learning_rate": 0.00027117513664346674, + "loss": 0.5297993850708008, + "mean_token_accuracy": 0.846615691781044, + "num_tokens": 2057575.0, + "step": 700 + }, + { + "entropy": 0.5732646904885769, + "epoch": 1.9486679662118258, + "grad_norm": 0.5528839230537415, + "learning_rate": 0.00026927748799421714, + "loss": 0.5219194793701172, + "mean_token_accuracy": 0.8489033079147339, + "num_tokens": 2208320.0, + "step": 750 + }, + { + "epoch": 2.0, + "eval_entropy": 0.6027900550801021, + "eval_loss": 0.5946928858757019, + "eval_mean_token_accuracy": 0.8318195798649237, + "eval_num_tokens": 2264280.0, + "eval_runtime": 53.3837, + "eval_samples_per_second": 31.039, + "eval_steps_per_second": 3.896, + "step": 770 + }, + { + "entropy": 0.5329899657611272, + "epoch": 2.077972709551657, + "grad_norm": 0.45793575048446655, + "learning_rate": 0.0002671108622767842, + "loss": 0.48420516967773436, + "mean_token_accuracy": 0.8578200301333289, + "num_tokens": 2348248.0, + "step": 800 + }, + { + "entropy": 0.5142687204480171, + "epoch": 2.207927225471085, + "grad_norm": 0.4690960645675659, + "learning_rate": 0.0002646797113644295, + "loss": 0.4593114471435547, + "mean_token_accuracy": 0.8622670090198516, + "num_tokens": 2501427.0, + "step": 850 + }, + { + "entropy": 0.5135884112119675, + "epoch": 2.3378817413905133, + "grad_norm": 0.3752821683883667, + "learning_rate": 0.00026198903066344565, + "loss": 0.4626216125488281, + "mean_token_accuracy": 0.8612511262297631, + "num_tokens": 2650794.0, + "step": 900 + }, + { + "entropy": 0.5137367483973503, + "epoch": 2.4678362573099415, + "grad_norm": 0.3726271390914917, + "learning_rate": 0.0002590443488488465, + "loss": 0.4601683807373047, + "mean_token_accuracy": 0.8620512077212333, + "num_tokens": 2798180.0, + "step": 950 + }, + { + "entropy": 0.5105714881420136, + "epoch": 2.5977907732293697, + "grad_norm": 0.41296717524528503, + "learning_rate": 0.00025585171650432525, + "loss": 0.46279102325439453, + "mean_token_accuracy": 0.8611763519048691, + "num_tokens": 2950301.0, + "step": 1000 + }, + { + "entropy": 0.5169161760807037, + "epoch": 2.727745289148798, + "grad_norm": 0.4614253044128418, + "learning_rate": 0.0002524176936898197, + "loss": 0.45492774963378907, + "mean_token_accuracy": 0.8627680170536042, + "num_tokens": 3091810.0, + "step": 1050 + }, + { + "entropy": 0.4989277676492929, + "epoch": 2.857699805068226, + "grad_norm": 0.37512704730033875, + "learning_rate": 0.00024874933646223225, + "loss": 0.4531984329223633, + "mean_token_accuracy": 0.8637665447592735, + "num_tokens": 3242184.0, + "step": 1100 + }, + { + "entropy": 0.5177617704868317, + "epoch": 2.9876543209876543, + "grad_norm": 0.3700532019138336, + "learning_rate": 0.00024485418237699976, + "loss": 0.45844474792480466, + "mean_token_accuracy": 0.8626988258957863, + "num_tokens": 3382605.0, + "step": 1150 + }, + { + "epoch": 3.0, + "eval_entropy": 0.5253989950108987, + "eval_loss": 0.5857328176498413, + "eval_mean_token_accuracy": 0.8360884573597175, + "eval_num_tokens": 3396420.0, + "eval_runtime": 53.341, + "eval_samples_per_second": 31.064, + "eval_steps_per_second": 3.899, + "step": 1155 + }, + { + "entropy": 0.4535361140517134, + "epoch": 3.116959064327485, + "grad_norm": 0.3412795662879944, + "learning_rate": 0.00024074023500030492, + "loss": 0.3942829132080078, + "mean_token_accuracy": 0.8781378038564519, + "num_tokens": 3522582.0, + "step": 1200 + }, + { + "entropy": 0.44747897461056707, + "epoch": 3.246913580246914, + "grad_norm": 0.46647050976753235, + "learning_rate": 0.0002364159474637521, + "loss": 0.38986759185791015, + "mean_token_accuracy": 0.8777281475067139, + "num_tokens": 3670864.0, + "step": 1250 + }, + { + "entropy": 0.4480265176296234, + "epoch": 3.3768680961663415, + "grad_norm": 0.4068582355976105, + "learning_rate": 0.00023189020509529866, + "loss": 0.39444759368896487, + "mean_token_accuracy": 0.8774515727162361, + "num_tokens": 3822021.0, + "step": 1300 + }, + { + "entropy": 0.45180007234215736, + "epoch": 3.50682261208577, + "grad_norm": 0.4249928593635559, + "learning_rate": 0.00022717230716213122, + "loss": 0.3977077102661133, + "mean_token_accuracy": 0.8762744688987731, + "num_tokens": 3968736.0, + "step": 1350 + }, + { + "entropy": 0.4614932192862034, + "epoch": 3.636777128005198, + "grad_norm": 0.561008095741272, + "learning_rate": 0.00022227194776300045, + "loss": 0.4022808456420898, + "mean_token_accuracy": 0.8760285252332687, + "num_tokens": 4113509.0, + "step": 1400 + }, + { + "entropy": 0.4414680179953575, + "epoch": 3.7667316439246266, + "grad_norm": 0.38943538069725037, + "learning_rate": 0.00021719919590927584, + "loss": 0.38586376190185545, + "mean_token_accuracy": 0.8783121705055237, + "num_tokens": 4267958.0, + "step": 1450 + }, + { + "entropy": 0.45685607343912127, + "epoch": 3.8966861598440543, + "grad_norm": 0.5362406969070435, + "learning_rate": 0.00021196447483564875, + "loss": 0.3983576583862305, + "mean_token_accuracy": 0.8764419692754746, + "num_tokens": 4415398.0, + "step": 1500 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49981127894268584, + "eval_loss": 0.5997208952903748, + "eval_mean_token_accuracy": 0.8368159819107789, + "eval_num_tokens": 4528560.0, + "eval_runtime": 53.4304, + "eval_samples_per_second": 31.012, + "eval_steps_per_second": 3.893, + "step": 1540 + }, + { + "entropy": 0.4439909208060509, + "epoch": 4.025990903183885, + "grad_norm": 0.5490113496780396, + "learning_rate": 0.00020657854058299564, + "loss": 0.38307292938232423, + "mean_token_accuracy": 0.8795150506436525, + "num_tokens": 4559534.0, + "step": 1550 + }, + { + "entropy": 0.3837250977009535, + "epoch": 4.155945419103314, + "grad_norm": 0.5567234754562378, + "learning_rate": 0.0002010524598974076, + "loss": 0.3182963752746582, + "mean_token_accuracy": 0.8964017608761787, + "num_tokens": 4707075.0, + "step": 1600 + }, + { + "entropy": 0.377872094810009, + "epoch": 4.2858999350227425, + "grad_norm": 0.4315710961818695, + "learning_rate": 0.00019539758749079845, + "loss": 0.318333683013916, + "mean_token_accuracy": 0.8963816618919372, + "num_tokens": 4851683.0, + "step": 1650 + }, + { + "entropy": 0.38739304527640345, + "epoch": 4.41585445094217, + "grad_norm": 0.49140632152557373, + "learning_rate": 0.00018962554270981555, + "loss": 0.32688804626464846, + "mean_token_accuracy": 0.8937860554456711, + "num_tokens": 4994086.0, + "step": 1700 + }, + { + "entropy": 0.39157475270330905, + "epoch": 4.545808966861598, + "grad_norm": 0.40667369961738586, + "learning_rate": 0.00018374818566099208, + "loss": 0.3305763626098633, + "mean_token_accuracy": 0.8916732975840569, + "num_tokens": 5137171.0, + "step": 1750 + }, + { + "entropy": 0.3838599680364132, + "epoch": 4.675763482781027, + "grad_norm": 0.4632417857646942, + "learning_rate": 0.0001777775928411983, + "loss": 0.3267818450927734, + "mean_token_accuracy": 0.8946500706672669, + "num_tokens": 5287076.0, + "step": 1800 + }, + { + "entropy": 0.38270787581801413, + "epoch": 4.805717998700455, + "grad_norm": 0.5529720187187195, + "learning_rate": 0.0001717260323234649, + "loss": 0.3264235305786133, + "mean_token_accuracy": 0.8948800846934318, + "num_tokens": 5436923.0, + "step": 1850 + }, + { + "entropy": 0.38736109718680384, + "epoch": 4.935672514619883, + "grad_norm": 0.5604785680770874, + "learning_rate": 0.00016560593854916497, + "loss": 0.3280513381958008, + "mean_token_accuracy": 0.8931388029456139, + "num_tokens": 5589195.0, + "step": 1900 + }, + { + "epoch": 5.0, + "eval_entropy": 0.4370518812479881, + "eval_loss": 0.6210553050041199, + "eval_mean_token_accuracy": 0.8379779781859654, + "eval_num_tokens": 5660700.0, + "eval_runtime": 53.4039, + "eval_samples_per_second": 31.028, + "eval_steps_per_second": 3.895, + "step": 1925 + }, + { + "entropy": 0.3330705868988181, + "epoch": 5.064977257959714, + "grad_norm": 0.5386723875999451, + "learning_rate": 0.0001594298867783512, + "loss": 0.2754818344116211, + "mean_token_accuracy": 0.9101201346771202, + "num_tokens": 5739445.0, + "step": 1950 + }, + { + "entropy": 0.2980620255321264, + "epoch": 5.1949317738791425, + "grad_norm": 0.5633581876754761, + "learning_rate": 0.00015321056725074549, + "loss": 0.23754241943359375, + "mean_token_accuracy": 0.9203532826900482, + "num_tokens": 5888043.0, + "step": 2000 + }, + { + "entropy": 0.3111592583358288, + "epoch": 5.32488628979857, + "grad_norm": 0.5031015872955322, + "learning_rate": 0.0001469607591104745, + "loss": 0.24428102493286133, + "mean_token_accuracy": 0.917181601524353, + "num_tokens": 6031284.0, + "step": 2050 + }, + { + "entropy": 0.31522042460739613, + "epoch": 5.454840805717999, + "grad_norm": 0.6432453393936157, + "learning_rate": 0.0001406933041481286, + "loss": 0.25112478256225584, + "mean_token_accuracy": 0.9152472382783889, + "num_tokens": 6179927.0, + "step": 2100 + }, + { + "entropy": 0.3046229027956724, + "epoch": 5.584795321637427, + "grad_norm": 0.5104537606239319, + "learning_rate": 0.00013442108041409814, + "loss": 0.2431495475769043, + "mean_token_accuracy": 0.917829519212246, + "num_tokens": 6322630.0, + "step": 2150 + }, + { + "entropy": 0.30440517760813235, + "epoch": 5.714749837556855, + "grad_norm": 0.5307765603065491, + "learning_rate": 0.0001281569757574053, + "loss": 0.24610313415527343, + "mean_token_accuracy": 0.9166415151953697, + "num_tokens": 6469843.0, + "step": 2200 + }, + { + "entropy": 0.304359400421381, + "epoch": 5.844704353476283, + "grad_norm": 0.5014523267745972, + "learning_rate": 0.00012191386134440133, + "loss": 0.24548973083496095, + "mean_token_accuracy": 0.9165477308630944, + "num_tokens": 6617768.0, + "step": 2250 + }, + { + "entropy": 0.3141418205201626, + "epoch": 5.974658869395712, + "grad_norm": 0.567398726940155, + "learning_rate": 0.00011570456521174339, + "loss": 0.24975168228149414, + "mean_token_accuracy": 0.9139353120326996, + "num_tokens": 6761187.0, + "step": 2300 + }, + { + "epoch": 6.0, + "eval_entropy": 0.3758909202252443, + "eval_loss": 0.6843022108078003, + "eval_mean_token_accuracy": 0.8348075449466705, + "eval_num_tokens": 6792840.0, + "eval_runtime": 53.3825, + "eval_samples_per_second": 31.04, + "eval_steps_per_second": 3.896, + "step": 2310 + }, + { + "entropy": 0.24346149432000203, + "epoch": 6.1039636127355426, + "grad_norm": 0.7140825986862183, + "learning_rate": 0.00010954184590799172, + "loss": 0.17231273651123047, + "mean_token_accuracy": 0.9407721275660261, + "num_tokens": 6909578.0, + "step": 2350 + }, + { + "entropy": 0.2160973483324051, + "epoch": 6.23391812865497, + "grad_norm": 0.49014952778816223, + "learning_rate": 0.00010343836627798716, + "loss": 0.15455107688903807, + "mean_token_accuracy": 0.9467655989527702, + "num_tokens": 7056244.0, + "step": 2400 + }, + { + "entropy": 0.21491924367845058, + "epoch": 6.363872644574399, + "grad_norm": 0.5529471635818481, + "learning_rate": 9.740666744387656e-05, + "loss": 0.1584029197692871, + "mean_token_accuracy": 0.9460993978381157, + "num_tokens": 7206950.0, + "step": 2450 + }, + { + "entropy": 0.22037068914622068, + "epoch": 6.493827160493828, + "grad_norm": 0.6232843995094299, + "learning_rate": 9.145914303624717e-05, + "loss": 0.15544342041015624, + "mean_token_accuracy": 0.9450622496008872, + "num_tokens": 7359429.0, + "step": 2500 + }, + { + "entropy": 0.2320463878661394, + "epoch": 6.623781676413255, + "grad_norm": 0.7459681630134583, + "learning_rate": 8.560801372831975e-05, + "loss": 0.16350215911865235, + "mean_token_accuracy": 0.9416968420147895, + "num_tokens": 7499281.0, + "step": 2550 + }, + { + "entropy": 0.22943626195192338, + "epoch": 6.753736192332683, + "grad_norm": 0.7482302784919739, + "learning_rate": 7.986530212552506e-05, + "loss": 0.16422538757324218, + "mean_token_accuracy": 0.9434959614276885, + "num_tokens": 7640758.0, + "step": 2600 + }, + { + "entropy": 0.21795938543975354, + "epoch": 6.883690708252112, + "grad_norm": 0.5210486054420471, + "learning_rate": 7.424280806206118e-05, + "loss": 0.15540474891662598, + "mean_token_accuracy": 0.9459306105971337, + "num_tokens": 7791986.0, + "step": 2650 + }, + { + "epoch": 7.0, + "eval_entropy": 0.3157534837149657, + "eval_loss": 0.7744874954223633, + "eval_mean_token_accuracy": 0.8346395036922052, + "eval_num_tokens": 7924980.0, + "eval_runtime": 53.3771, + "eval_samples_per_second": 31.043, + "eval_steps_per_second": 3.897, + "step": 2695 + }, + { + "entropy": 0.2124774060656677, + "epoch": 7.012995451591943, + "grad_norm": 0.42238789796829224, + "learning_rate": 6.875208435518865e-05, + "loss": 0.14792531967163086, + "mean_token_accuracy": 0.9490461115861059, + "num_tokens": 7940521.0, + "step": 2700 + }, + { + "entropy": 0.15692965138703585, + "epoch": 7.142949967511371, + "grad_norm": 0.4711572229862213, + "learning_rate": 6.340441306708468e-05, + "loss": 0.09051708221435546, + "mean_token_accuracy": 0.9700166273117066, + "num_tokens": 8084193.0, + "step": 2750 + }, + { + "entropy": 0.15241683423519134, + "epoch": 7.272904483430799, + "grad_norm": 0.4312196671962738, + "learning_rate": 5.821078232303016e-05, + "loss": 0.08812363624572754, + "mean_token_accuracy": 0.9699361199140548, + "num_tokens": 8230159.0, + "step": 2800 + }, + { + "entropy": 0.1459079357981682, + "epoch": 7.402858999350228, + "grad_norm": 0.4804084002971649, + "learning_rate": 5.3181863733564636e-05, + "loss": 0.08675944328308105, + "mean_token_accuracy": 0.9703072866797448, + "num_tokens": 8380556.0, + "step": 2850 + }, + { + "entropy": 0.15621621005237102, + "epoch": 7.532813515269655, + "grad_norm": 0.5435478091239929, + "learning_rate": 4.83279904669986e-05, + "loss": 0.09016354560852051, + "mean_token_accuracy": 0.9674961140751839, + "num_tokens": 8523248.0, + "step": 2900 + }, + { + "entropy": 0.1545175113901496, + "epoch": 7.662768031189084, + "grad_norm": 0.524286687374115, + "learning_rate": 4.365913601734056e-05, + "loss": 0.09049141883850098, + "mean_token_accuracy": 0.9679373624920845, + "num_tokens": 8672002.0, + "step": 2950 + }, + { + "entropy": 0.1553485019877553, + "epoch": 7.792722547108512, + "grad_norm": 0.5006484389305115, + "learning_rate": 3.9184893711264495e-05, + "loss": 0.08913107872009278, + "mean_token_accuracy": 0.9684559822082519, + "num_tokens": 8816090.0, + "step": 3000 + }, + { + "entropy": 0.15444331549108029, + "epoch": 7.92267706302794, + "grad_norm": 0.5613893866539001, + "learning_rate": 3.491445699622611e-05, + "loss": 0.08711207389831543, + "mean_token_accuracy": 0.9684525722265244, + "num_tokens": 8966004.0, + "step": 3050 + }, + { + "epoch": 8.0, + "eval_entropy": 0.26580205430778175, + "eval_loss": 0.8996392488479614, + "eval_mean_token_accuracy": 0.8319417791297803, + "eval_num_tokens": 9057120.0, + "eval_runtime": 53.4042, + "eval_samples_per_second": 31.028, + "eval_steps_per_second": 3.895, + "step": 3080 + }, + { + "entropy": 0.14060429509860187, + "epoch": 8.05198180636777, + "grad_norm": 0.25378674268722534, + "learning_rate": 3.085660055023035e-05, + "loss": 0.07468742847442628, + "mean_token_accuracy": 0.9737296260181983, + "num_tokens": 9115929.0, + "step": 3100 + }, + { + "entropy": 0.12152639016509056, + "epoch": 8.1819363222872, + "grad_norm": 0.3634810447692871, + "learning_rate": 2.7019662252065798e-05, + "loss": 0.05918361663818359, + "mean_token_accuracy": 0.9785374769568443, + "num_tokens": 9260646.0, + "step": 3150 + }, + { + "entropy": 0.1228457903675735, + "epoch": 8.311890838206628, + "grad_norm": 0.32873299717903137, + "learning_rate": 2.3411526049051643e-05, + "loss": 0.060924801826477054, + "mean_token_accuracy": 0.9778030979633331, + "num_tokens": 9404737.0, + "step": 3200 + }, + { + "entropy": 0.11790491977706552, + "epoch": 8.441845354126055, + "grad_norm": 0.27871423959732056, + "learning_rate": 2.0039605757500512e-05, + "loss": 0.05871880531311035, + "mean_token_accuracy": 0.9786837643384934, + "num_tokens": 9552241.0, + "step": 3250 + }, + { + "entropy": 0.12911307733505964, + "epoch": 8.571799870045485, + "grad_norm": 0.4569152593612671, + "learning_rate": 1.691082982918235e-05, + "loss": 0.06450970649719238, + "mean_token_accuracy": 0.9761316785216332, + "num_tokens": 9689407.0, + "step": 3300 + }, + { + "entropy": 0.11636774389073253, + "epoch": 8.701754385964913, + "grad_norm": 0.27477338910102844, + "learning_rate": 1.403162711509129e-05, + "loss": 0.05784036159515381, + "mean_token_accuracy": 0.9791285961866378, + "num_tokens": 9842204.0, + "step": 3350 + }, + { + "entropy": 0.11710284009575844, + "epoch": 8.83170890188434, + "grad_norm": 0.26900890469551086, + "learning_rate": 1.1407913655766755e-05, + "loss": 0.05737146377563476, + "mean_token_accuracy": 0.9788037702441216, + "num_tokens": 9994524.0, + "step": 3400 + }, + { + "entropy": 0.11474769543856382, + "epoch": 8.961663417803768, + "grad_norm": 0.3137633204460144, + "learning_rate": 9.045080525311815e-06, + "loss": 0.057830405235290525, + "mean_token_accuracy": 0.9789029136300087, + "num_tokens": 10147597.0, + "step": 3450 + }, + { + "epoch": 9.0, + "eval_entropy": 0.23872991701444754, + "eval_loss": 1.007972002029419, + "eval_mean_token_accuracy": 0.8318104110658169, + "eval_num_tokens": 10189260.0, + "eval_runtime": 53.4047, + "eval_samples_per_second": 31.027, + "eval_steps_per_second": 3.895, + "step": 3465 + }, + { + "entropy": 0.10747776249769944, + "epoch": 9.0909681611436, + "grad_norm": 0.21027140319347382, + "learning_rate": 6.9479827540858e-06, + "loss": 0.05341584682464599, + "mean_token_accuracy": 0.9819158309668152, + "num_tokens": 10294278.0, + "step": 3500 + }, + { + "entropy": 0.10646019088104368, + "epoch": 9.220922677063028, + "grad_norm": 0.1820213794708252, + "learning_rate": 5.120929352832946e-06, + "loss": 0.04993132591247559, + "mean_token_accuracy": 0.9813734939694405, + "num_tokens": 10446433.0, + "step": 3550 + }, + { + "entropy": 0.11276215925812721, + "epoch": 9.350877192982455, + "grad_norm": 0.28350868821144104, + "learning_rate": 3.5676744587442527e-06, + "loss": 0.05132147789001465, + "mean_token_accuracy": 0.9798016020655632, + "num_tokens": 10596480.0, + "step": 3600 + }, + { + "entropy": 0.11684846783056856, + "epoch": 9.480831708901885, + "grad_norm": 0.24154022336006165, + "learning_rate": 2.2914096216458985e-06, + "loss": 0.05330245018005371, + "mean_token_accuracy": 0.9795207896828652, + "num_tokens": 10740528.0, + "step": 3650 + }, + { + "entropy": 0.11312318585813046, + "epoch": 9.610786224821313, + "grad_norm": 0.28576743602752686, + "learning_rate": 1.2947572461634096e-06, + "loss": 0.05499160289764404, + "mean_token_accuracy": 0.9797447052598, + "num_tokens": 10881750.0, + "step": 3700 + }, + { + "entropy": 0.11110325066372752, + "epoch": 9.74074074074074, + "grad_norm": 0.22744601964950562, + "learning_rate": 5.79765203336998e-07, + "loss": 0.05335733413696289, + "mean_token_accuracy": 0.9804390069842338, + "num_tokens": 11027552.0, + "step": 3750 + }, + { + "entropy": 0.11025484301149845, + "epoch": 9.870695256660168, + "grad_norm": 0.2376026064157486, + "learning_rate": 1.4790262275940392e-07, + "loss": 0.053631534576416014, + "mean_token_accuracy": 0.9807139033079147, + "num_tokens": 11172587.0, + "step": 3800 + }, + { + "entropy": 0.11222033412312743, + "epoch": 10.0, + "grad_norm": 0.22163568437099457, + "learning_rate": 5.6873882486966634e-11, + "loss": 0.05134555339813232, + "mean_token_accuracy": 0.9805273500519183, + "num_tokens": 11321400.0, + "step": 3850 + }, + { + "epoch": 10.0, + "eval_entropy": 0.23073764632527644, + "eval_loss": 1.059328556060791, + "eval_mean_token_accuracy": 0.8315239631785796, + "eval_num_tokens": 11321400.0, + "eval_runtime": 53.4589, + "eval_samples_per_second": 30.996, + "eval_steps_per_second": 3.891, + "step": 3850 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.925137653703373e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/README.md b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7ee79256b30e387ac41d2786d79a749c70114aaa --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/README.md @@ -0,0 +1,58 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: transformers +model_name: Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1 +tags: +- generated_from_trainer +- sft +- trl +licence: license +--- + +# Model Card for Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1 + +This model is a fine-tuned version of [Qwen/Qwen3-4B-Base](https://huggingface.co/Qwen/Qwen3-4B-Base). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/katriin-kukk/Cross_lingual_morphological_generalization/runs/ranbbj8x) + + + +This model was trained with SFT. + +### Framework versions + +- TRL: 0.29.0 +- Transformers: 5.5.4 +- Pytorch: 2.10.0 +- Datasets: 4.6.1 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-1122/README.md b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-1122/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-1122/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-1122/tokenizer_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-1122/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-1122/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/README.md b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6f9710c7923482a2e6cb286018e55b18316a2856 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/README.md @@ -0,0 +1,58 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: transformers +model_name: Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2 +tags: +- generated_from_trainer +- sft +- trl +licence: license +--- + +# Model Card for Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2 + +This model is a fine-tuned version of [Qwen/Qwen3-4B-Base](https://huggingface.co/Qwen/Qwen3-4B-Base). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/katriin-kukk/Cross_lingual_morphological_generalization/runs/y6cm94yy) + + + +This model was trained with SFT. + +### Framework versions + +- TRL: 0.29.0 +- Transformers: 5.5.4 +- Pytorch: 2.10.0 +- Datasets: 4.6.1 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/README.md b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/adapter_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8818eeae60314bb5e6cd8bb9d31975488784721f --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.0094403300459725, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "q_proj", + "gate_proj", + "o_proj", + "v_proj", + "k_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/chat_template.jinja b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/tokenizer_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/trainer_state.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7efdc248db95c46bb6a9070adb343c159b1896cc --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/trainer_state.json @@ -0,0 +1,287 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1122, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.2357245934009553, + "epoch": 0.13386880856760375, + "grad_norm": 1.1940524578094482, + "learning_rate": 3.0637587087867373e-05, + "loss": 2.0509376525878906, + "mean_token_accuracy": 0.6194104523956776, + "num_tokens": 127704.0, + "step": 50 + }, + { + "entropy": 0.8588631230592728, + "epoch": 0.2677376171352075, + "grad_norm": 0.6621095538139343, + "learning_rate": 6.190043105507899e-05, + "loss": 0.8043325805664062, + "mean_token_accuracy": 0.7855384379625321, + "num_tokens": 256077.0, + "step": 100 + }, + { + "entropy": 0.6726470375061036, + "epoch": 0.40160642570281124, + "grad_norm": 0.5255736112594604, + "learning_rate": 9.316327502229059e-05, + "loss": 0.6415129089355469, + "mean_token_accuracy": 0.8183896672725678, + "num_tokens": 387735.0, + "step": 150 + }, + { + "entropy": 0.6119109469652176, + "epoch": 0.535475234270415, + "grad_norm": 0.45975860953330994, + "learning_rate": 0.0001244261189895022, + "loss": 0.576114501953125, + "mean_token_accuracy": 0.8376823288202285, + "num_tokens": 522202.0, + "step": 200 + }, + { + "entropy": 0.5972725109755993, + "epoch": 0.6693440428380187, + "grad_norm": 0.40055611729621887, + "learning_rate": 0.0001556889629567138, + "loss": 0.5613529205322265, + "mean_token_accuracy": 0.8418879929184914, + "num_tokens": 648663.0, + "step": 250 + }, + { + "entropy": 0.5673307004570961, + "epoch": 0.8032128514056225, + "grad_norm": 0.7427454590797424, + "learning_rate": 0.0001869518069239254, + "loss": 0.5325925827026368, + "mean_token_accuracy": 0.8488189685344696, + "num_tokens": 778245.0, + "step": 300 + }, + { + "entropy": 0.5615521620213986, + "epoch": 0.9370816599732262, + "grad_norm": 0.4017025828361511, + "learning_rate": 0.000218214650891137, + "loss": 0.522266616821289, + "mean_token_accuracy": 0.8500416606664658, + "num_tokens": 905328.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5758187392354012, + "eval_loss": 0.5664522051811218, + "eval_mean_token_accuracy": 0.8383084440231323, + "eval_num_tokens": 959743.0, + "eval_runtime": 51.7964, + "eval_samples_per_second": 30.852, + "eval_steps_per_second": 3.861, + "step": 374 + }, + { + "entropy": 0.5440343178883947, + "epoch": 1.069611780455154, + "grad_norm": 0.3084093928337097, + "learning_rate": 0.00023381424541885068, + "loss": 0.5051319122314453, + "mean_token_accuracy": 0.8554374280602041, + "num_tokens": 1022581.0, + "step": 400 + }, + { + "entropy": 0.5245272906124592, + "epoch": 1.2034805890227578, + "grad_norm": 0.3342743515968323, + "learning_rate": 0.00023355972972676628, + "loss": 0.4855318450927734, + "mean_token_accuracy": 0.8584430786967278, + "num_tokens": 1151430.0, + "step": 450 + }, + { + "entropy": 0.5234513898193837, + "epoch": 1.3373493975903614, + "grad_norm": 0.26953577995300293, + "learning_rate": 0.00023305125251804043, + "loss": 0.4811768341064453, + "mean_token_accuracy": 0.8606387570500373, + "num_tokens": 1282481.0, + "step": 500 + }, + { + "entropy": 0.5147616830468178, + "epoch": 1.4712182061579653, + "grad_norm": 0.28791534900665283, + "learning_rate": 0.0002322899209369128, + "loss": 0.4762062835693359, + "mean_token_accuracy": 0.8612849581241607, + "num_tokens": 1414166.0, + "step": 550 + }, + { + "entropy": 0.5060296922922134, + "epoch": 1.605087014725569, + "grad_norm": 0.2496163249015808, + "learning_rate": 0.0002312773926857543, + "loss": 0.4695176315307617, + "mean_token_accuracy": 0.8636157616972924, + "num_tokens": 1547902.0, + "step": 600 + }, + { + "entropy": 0.5008939932286739, + "epoch": 1.7389558232931726, + "grad_norm": 0.2329542636871338, + "learning_rate": 0.00023001587241563198, + "loss": 0.46317913055419924, + "mean_token_accuracy": 0.8652430367469788, + "num_tokens": 1678635.0, + "step": 650 + }, + { + "entropy": 0.4986082436144352, + "epoch": 1.8728246318607764, + "grad_norm": 0.2459402084350586, + "learning_rate": 0.00022850810692596235, + "loss": 0.4617066192626953, + "mean_token_accuracy": 0.8672981086373329, + "num_tokens": 1803328.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.546445109397173, + "eval_loss": 0.5307288765907288, + "eval_mean_token_accuracy": 0.8434162598848343, + "eval_num_tokens": 1919486.0, + "eval_runtime": 51.3627, + "eval_samples_per_second": 31.112, + "eval_steps_per_second": 3.894, + "step": 748 + }, + { + "entropy": 0.5069879525237613, + "epoch": 2.005354752342704, + "grad_norm": 0.21533548831939697, + "learning_rate": 0.00022675737918370628, + "loss": 0.4585062026977539, + "mean_token_accuracy": 0.865652882390552, + "num_tokens": 1925307.0, + "step": 750 + }, + { + "entropy": 0.4457989126443863, + "epoch": 2.139223560910308, + "grad_norm": 0.24705350399017334, + "learning_rate": 0.00022476750117512737, + "loss": 0.4026710891723633, + "mean_token_accuracy": 0.8774338760972022, + "num_tokens": 2055627.0, + "step": 800 + }, + { + "entropy": 0.45084999009966853, + "epoch": 2.2730923694779115, + "grad_norm": 0.26643863320350647, + "learning_rate": 0.00022254280560567822, + "loss": 0.40950340270996094, + "mean_token_accuracy": 0.8752825647592545, + "num_tokens": 2177850.0, + "step": 850 + }, + { + "entropy": 0.4516983331739903, + "epoch": 2.4069611780455156, + "grad_norm": 0.26972198486328125, + "learning_rate": 0.00022008813646608725, + "loss": 0.4115512466430664, + "mean_token_accuracy": 0.8761435833573341, + "num_tokens": 2304612.0, + "step": 900 + }, + { + "entropy": 0.45280722543597224, + "epoch": 2.540829986613119, + "grad_norm": 0.2643600106239319, + "learning_rate": 0.00021740883848518684, + "loss": 0.41181053161621095, + "mean_token_accuracy": 0.8756420350074768, + "num_tokens": 2430946.0, + "step": 950 + }, + { + "entropy": 0.4487862553447485, + "epoch": 2.674698795180723, + "grad_norm": 0.2849285304546356, + "learning_rate": 0.00021451074549244846, + "loss": 0.4094270706176758, + "mean_token_accuracy": 0.8771369129419326, + "num_tokens": 2557241.0, + "step": 1000 + }, + { + "entropy": 0.45000545382499696, + "epoch": 2.8085676037483265, + "grad_norm": 0.24081671237945557, + "learning_rate": 0.0002114001677155633, + "loss": 0.4073855972290039, + "mean_token_accuracy": 0.8779223081469536, + "num_tokens": 2692775.0, + "step": 1050 + }, + { + "entropy": 0.4576124830543995, + "epoch": 2.9424364123159306, + "grad_norm": 0.2341010719537735, + "learning_rate": 0.00020808387804072673, + "loss": 0.4154107666015625, + "mean_token_accuracy": 0.8756425747275353, + "num_tokens": 2823060.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.4812743577361107, + "eval_loss": 0.523208498954773, + "eval_mean_token_accuracy": 0.8494922530651092, + "eval_num_tokens": 2879229.0, + "eval_runtime": 51.3707, + "eval_samples_per_second": 31.107, + "eval_steps_per_second": 3.893, + "step": 1122 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.23531258243029e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/README.md b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/adapter_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8818eeae60314bb5e6cd8bb9d31975488784721f --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.0094403300459725, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "q_proj", + "gate_proj", + "o_proj", + "v_proj", + "k_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/chat_template.jinja b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/tokenizer_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/trainer_state.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8579a5b3822c16431e12b7178ef55a17371e2072 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/trainer_state.json @@ -0,0 +1,368 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 1496, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.2357245934009553, + "epoch": 0.13386880856760375, + "grad_norm": 1.1940524578094482, + "learning_rate": 3.0637587087867373e-05, + "loss": 2.0509376525878906, + "mean_token_accuracy": 0.6194104523956776, + "num_tokens": 127704.0, + "step": 50 + }, + { + "entropy": 0.8588631230592728, + "epoch": 0.2677376171352075, + "grad_norm": 0.6621095538139343, + "learning_rate": 6.190043105507899e-05, + "loss": 0.8043325805664062, + "mean_token_accuracy": 0.7855384379625321, + "num_tokens": 256077.0, + "step": 100 + }, + { + "entropy": 0.6726470375061036, + "epoch": 0.40160642570281124, + "grad_norm": 0.5255736112594604, + "learning_rate": 9.316327502229059e-05, + "loss": 0.6415129089355469, + "mean_token_accuracy": 0.8183896672725678, + "num_tokens": 387735.0, + "step": 150 + }, + { + "entropy": 0.6119109469652176, + "epoch": 0.535475234270415, + "grad_norm": 0.45975860953330994, + "learning_rate": 0.0001244261189895022, + "loss": 0.576114501953125, + "mean_token_accuracy": 0.8376823288202285, + "num_tokens": 522202.0, + "step": 200 + }, + { + "entropy": 0.5972725109755993, + "epoch": 0.6693440428380187, + "grad_norm": 0.40055611729621887, + "learning_rate": 0.0001556889629567138, + "loss": 0.5613529205322265, + "mean_token_accuracy": 0.8418879929184914, + "num_tokens": 648663.0, + "step": 250 + }, + { + "entropy": 0.5673307004570961, + "epoch": 0.8032128514056225, + "grad_norm": 0.7427454590797424, + "learning_rate": 0.0001869518069239254, + "loss": 0.5325925827026368, + "mean_token_accuracy": 0.8488189685344696, + "num_tokens": 778245.0, + "step": 300 + }, + { + "entropy": 0.5615521620213986, + "epoch": 0.9370816599732262, + "grad_norm": 0.4017025828361511, + "learning_rate": 0.000218214650891137, + "loss": 0.522266616821289, + "mean_token_accuracy": 0.8500416606664658, + "num_tokens": 905328.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5758187392354012, + "eval_loss": 0.5664522051811218, + "eval_mean_token_accuracy": 0.8383084440231323, + "eval_num_tokens": 959743.0, + "eval_runtime": 51.7964, + "eval_samples_per_second": 30.852, + "eval_steps_per_second": 3.861, + "step": 374 + }, + { + "entropy": 0.5440343178883947, + "epoch": 1.069611780455154, + "grad_norm": 0.3084093928337097, + "learning_rate": 0.00023381424541885068, + "loss": 0.5051319122314453, + "mean_token_accuracy": 0.8554374280602041, + "num_tokens": 1022581.0, + "step": 400 + }, + { + "entropy": 0.5245272906124592, + "epoch": 1.2034805890227578, + "grad_norm": 0.3342743515968323, + "learning_rate": 0.00023355972972676628, + "loss": 0.4855318450927734, + "mean_token_accuracy": 0.8584430786967278, + "num_tokens": 1151430.0, + "step": 450 + }, + { + "entropy": 0.5234513898193837, + "epoch": 1.3373493975903614, + "grad_norm": 0.26953577995300293, + "learning_rate": 0.00023305125251804043, + "loss": 0.4811768341064453, + "mean_token_accuracy": 0.8606387570500373, + "num_tokens": 1282481.0, + "step": 500 + }, + { + "entropy": 0.5147616830468178, + "epoch": 1.4712182061579653, + "grad_norm": 0.28791534900665283, + "learning_rate": 0.0002322899209369128, + "loss": 0.4762062835693359, + "mean_token_accuracy": 0.8612849581241607, + "num_tokens": 1414166.0, + "step": 550 + }, + { + "entropy": 0.5060296922922134, + "epoch": 1.605087014725569, + "grad_norm": 0.2496163249015808, + "learning_rate": 0.0002312773926857543, + "loss": 0.4695176315307617, + "mean_token_accuracy": 0.8636157616972924, + "num_tokens": 1547902.0, + "step": 600 + }, + { + "entropy": 0.5008939932286739, + "epoch": 1.7389558232931726, + "grad_norm": 0.2329542636871338, + "learning_rate": 0.00023001587241563198, + "loss": 0.46317913055419924, + "mean_token_accuracy": 0.8652430367469788, + "num_tokens": 1678635.0, + "step": 650 + }, + { + "entropy": 0.4986082436144352, + "epoch": 1.8728246318607764, + "grad_norm": 0.2459402084350586, + "learning_rate": 0.00022850810692596235, + "loss": 0.4617066192626953, + "mean_token_accuracy": 0.8672981086373329, + "num_tokens": 1803328.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.546445109397173, + "eval_loss": 0.5307288765907288, + "eval_mean_token_accuracy": 0.8434162598848343, + "eval_num_tokens": 1919486.0, + "eval_runtime": 51.3627, + "eval_samples_per_second": 31.112, + "eval_steps_per_second": 3.894, + "step": 748 + }, + { + "entropy": 0.5069879525237613, + "epoch": 2.005354752342704, + "grad_norm": 0.21533548831939697, + "learning_rate": 0.00022675737918370628, + "loss": 0.4585062026977539, + "mean_token_accuracy": 0.865652882390552, + "num_tokens": 1925307.0, + "step": 750 + }, + { + "entropy": 0.4457989126443863, + "epoch": 2.139223560910308, + "grad_norm": 0.24705350399017334, + "learning_rate": 0.00022476750117512737, + "loss": 0.4026710891723633, + "mean_token_accuracy": 0.8774338760972022, + "num_tokens": 2055627.0, + "step": 800 + }, + { + "entropy": 0.45084999009966853, + "epoch": 2.2730923694779115, + "grad_norm": 0.26643863320350647, + "learning_rate": 0.00022254280560567822, + "loss": 0.40950340270996094, + "mean_token_accuracy": 0.8752825647592545, + "num_tokens": 2177850.0, + "step": 850 + }, + { + "entropy": 0.4516983331739903, + "epoch": 2.4069611780455156, + "grad_norm": 0.26972198486328125, + "learning_rate": 0.00022008813646608725, + "loss": 0.4115512466430664, + "mean_token_accuracy": 0.8761435833573341, + "num_tokens": 2304612.0, + "step": 900 + }, + { + "entropy": 0.45280722543597224, + "epoch": 2.540829986613119, + "grad_norm": 0.2643600106239319, + "learning_rate": 0.00021740883848518684, + "loss": 0.41181053161621095, + "mean_token_accuracy": 0.8756420350074768, + "num_tokens": 2430946.0, + "step": 950 + }, + { + "entropy": 0.4487862553447485, + "epoch": 2.674698795180723, + "grad_norm": 0.2849285304546356, + "learning_rate": 0.00021451074549244846, + "loss": 0.4094270706176758, + "mean_token_accuracy": 0.8771369129419326, + "num_tokens": 2557241.0, + "step": 1000 + }, + { + "entropy": 0.45000545382499696, + "epoch": 2.8085676037483265, + "grad_norm": 0.24081671237945557, + "learning_rate": 0.0002114001677155633, + "loss": 0.4073855972290039, + "mean_token_accuracy": 0.8779223081469536, + "num_tokens": 2692775.0, + "step": 1050 + }, + { + "entropy": 0.4576124830543995, + "epoch": 2.9424364123159306, + "grad_norm": 0.2341010719537735, + "learning_rate": 0.00020808387804072673, + "loss": 0.4154107666015625, + "mean_token_accuracy": 0.8756425747275353, + "num_tokens": 2823060.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.4812743577361107, + "eval_loss": 0.523208498954773, + "eval_mean_token_accuracy": 0.8494922530651092, + "eval_num_tokens": 2879229.0, + "eval_runtime": 51.3707, + "eval_samples_per_second": 31.107, + "eval_steps_per_second": 3.893, + "step": 1122 + }, + { + "entropy": 0.41342759042075183, + "epoch": 3.074966532797858, + "grad_norm": 0.34829744696617126, + "learning_rate": 0.0002045690972655427, + "loss": 0.3644887542724609, + "mean_token_accuracy": 0.8879408977850519, + "num_tokens": 2955424.0, + "step": 1150 + }, + { + "entropy": 0.3948360003530979, + "epoch": 3.208835341365462, + "grad_norm": 0.312427282333374, + "learning_rate": 0.00020086347837665854, + "loss": 0.34146129608154296, + "mean_token_accuracy": 0.8914449456334114, + "num_tokens": 3078799.0, + "step": 1200 + }, + { + "entropy": 0.38602679744362833, + "epoch": 3.3427041499330654, + "grad_norm": 0.2715625762939453, + "learning_rate": 0.0001969750898863629, + "loss": 0.34105979919433593, + "mean_token_accuracy": 0.8928995525836945, + "num_tokens": 3211945.0, + "step": 1250 + }, + { + "entropy": 0.3923524462431669, + "epoch": 3.4765729585006695, + "grad_norm": 0.27295640110969543, + "learning_rate": 0.00019291239826442992, + "loss": 0.3473458099365234, + "mean_token_accuracy": 0.8913829082250595, + "num_tokens": 3343933.0, + "step": 1300 + }, + { + "entropy": 0.40172914519906044, + "epoch": 3.610441767068273, + "grad_norm": 0.32684192061424255, + "learning_rate": 0.0001886842495034615, + "loss": 0.35543827056884764, + "mean_token_accuracy": 0.8904094022512435, + "num_tokens": 3470087.0, + "step": 1350 + }, + { + "entropy": 0.4010512103140354, + "epoch": 3.7443105756358768, + "grad_norm": 0.23922984302043915, + "learning_rate": 0.00018429984985786734, + "loss": 0.3535212326049805, + "mean_token_accuracy": 0.8891122484207153, + "num_tokens": 3590858.0, + "step": 1400 + }, + { + "entropy": 0.39226082623004915, + "epoch": 3.878179384203481, + "grad_norm": 0.25130993127822876, + "learning_rate": 0.00017976874579842046, + "loss": 0.3484851837158203, + "mean_token_accuracy": 0.8916165816783905, + "num_tokens": 3725806.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.45332007586956025, + "eval_loss": 0.5243425965309143, + "eval_mean_token_accuracy": 0.8532214590907097, + "eval_num_tokens": 3838972.0, + "eval_runtime": 51.3837, + "eval_samples_per_second": 31.099, + "eval_steps_per_second": 3.892, + "step": 1496 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.647043150972508e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/README.md b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/adapter_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8818eeae60314bb5e6cd8bb9d31975488784721f --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.0094403300459725, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "q_proj", + "gate_proj", + "o_proj", + "v_proj", + "k_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/chat_template.jinja b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/tokenizer_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/trainer_state.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1325a869fb61802392682d92bac5b6563c8c2143 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/trainer_state.json @@ -0,0 +1,459 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 1870, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.2357245934009553, + "epoch": 0.13386880856760375, + "grad_norm": 1.1940524578094482, + "learning_rate": 3.0637587087867373e-05, + "loss": 2.0509376525878906, + "mean_token_accuracy": 0.6194104523956776, + "num_tokens": 127704.0, + "step": 50 + }, + { + "entropy": 0.8588631230592728, + "epoch": 0.2677376171352075, + "grad_norm": 0.6621095538139343, + "learning_rate": 6.190043105507899e-05, + "loss": 0.8043325805664062, + "mean_token_accuracy": 0.7855384379625321, + "num_tokens": 256077.0, + "step": 100 + }, + { + "entropy": 0.6726470375061036, + "epoch": 0.40160642570281124, + "grad_norm": 0.5255736112594604, + "learning_rate": 9.316327502229059e-05, + "loss": 0.6415129089355469, + "mean_token_accuracy": 0.8183896672725678, + "num_tokens": 387735.0, + "step": 150 + }, + { + "entropy": 0.6119109469652176, + "epoch": 0.535475234270415, + "grad_norm": 0.45975860953330994, + "learning_rate": 0.0001244261189895022, + "loss": 0.576114501953125, + "mean_token_accuracy": 0.8376823288202285, + "num_tokens": 522202.0, + "step": 200 + }, + { + "entropy": 0.5972725109755993, + "epoch": 0.6693440428380187, + "grad_norm": 0.40055611729621887, + "learning_rate": 0.0001556889629567138, + "loss": 0.5613529205322265, + "mean_token_accuracy": 0.8418879929184914, + "num_tokens": 648663.0, + "step": 250 + }, + { + "entropy": 0.5673307004570961, + "epoch": 0.8032128514056225, + "grad_norm": 0.7427454590797424, + "learning_rate": 0.0001869518069239254, + "loss": 0.5325925827026368, + "mean_token_accuracy": 0.8488189685344696, + "num_tokens": 778245.0, + "step": 300 + }, + { + "entropy": 0.5615521620213986, + "epoch": 0.9370816599732262, + "grad_norm": 0.4017025828361511, + "learning_rate": 0.000218214650891137, + "loss": 0.522266616821289, + "mean_token_accuracy": 0.8500416606664658, + "num_tokens": 905328.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5758187392354012, + "eval_loss": 0.5664522051811218, + "eval_mean_token_accuracy": 0.8383084440231323, + "eval_num_tokens": 959743.0, + "eval_runtime": 51.7964, + "eval_samples_per_second": 30.852, + "eval_steps_per_second": 3.861, + "step": 374 + }, + { + "entropy": 0.5440343178883947, + "epoch": 1.069611780455154, + "grad_norm": 0.3084093928337097, + "learning_rate": 0.00023381424541885068, + "loss": 0.5051319122314453, + "mean_token_accuracy": 0.8554374280602041, + "num_tokens": 1022581.0, + "step": 400 + }, + { + "entropy": 0.5245272906124592, + "epoch": 1.2034805890227578, + "grad_norm": 0.3342743515968323, + "learning_rate": 0.00023355972972676628, + "loss": 0.4855318450927734, + "mean_token_accuracy": 0.8584430786967278, + "num_tokens": 1151430.0, + "step": 450 + }, + { + "entropy": 0.5234513898193837, + "epoch": 1.3373493975903614, + "grad_norm": 0.26953577995300293, + "learning_rate": 0.00023305125251804043, + "loss": 0.4811768341064453, + "mean_token_accuracy": 0.8606387570500373, + "num_tokens": 1282481.0, + "step": 500 + }, + { + "entropy": 0.5147616830468178, + "epoch": 1.4712182061579653, + "grad_norm": 0.28791534900665283, + "learning_rate": 0.0002322899209369128, + "loss": 0.4762062835693359, + "mean_token_accuracy": 0.8612849581241607, + "num_tokens": 1414166.0, + "step": 550 + }, + { + "entropy": 0.5060296922922134, + "epoch": 1.605087014725569, + "grad_norm": 0.2496163249015808, + "learning_rate": 0.0002312773926857543, + "loss": 0.4695176315307617, + "mean_token_accuracy": 0.8636157616972924, + "num_tokens": 1547902.0, + "step": 600 + }, + { + "entropy": 0.5008939932286739, + "epoch": 1.7389558232931726, + "grad_norm": 0.2329542636871338, + "learning_rate": 0.00023001587241563198, + "loss": 0.46317913055419924, + "mean_token_accuracy": 0.8652430367469788, + "num_tokens": 1678635.0, + "step": 650 + }, + { + "entropy": 0.4986082436144352, + "epoch": 1.8728246318607764, + "grad_norm": 0.2459402084350586, + "learning_rate": 0.00022850810692596235, + "loss": 0.4617066192626953, + "mean_token_accuracy": 0.8672981086373329, + "num_tokens": 1803328.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.546445109397173, + "eval_loss": 0.5307288765907288, + "eval_mean_token_accuracy": 0.8434162598848343, + "eval_num_tokens": 1919486.0, + "eval_runtime": 51.3627, + "eval_samples_per_second": 31.112, + "eval_steps_per_second": 3.894, + "step": 748 + }, + { + "entropy": 0.5069879525237613, + "epoch": 2.005354752342704, + "grad_norm": 0.21533548831939697, + "learning_rate": 0.00022675737918370628, + "loss": 0.4585062026977539, + "mean_token_accuracy": 0.865652882390552, + "num_tokens": 1925307.0, + "step": 750 + }, + { + "entropy": 0.4457989126443863, + "epoch": 2.139223560910308, + "grad_norm": 0.24705350399017334, + "learning_rate": 0.00022476750117512737, + "loss": 0.4026710891723633, + "mean_token_accuracy": 0.8774338760972022, + "num_tokens": 2055627.0, + "step": 800 + }, + { + "entropy": 0.45084999009966853, + "epoch": 2.2730923694779115, + "grad_norm": 0.26643863320350647, + "learning_rate": 0.00022254280560567822, + "loss": 0.40950340270996094, + "mean_token_accuracy": 0.8752825647592545, + "num_tokens": 2177850.0, + "step": 850 + }, + { + "entropy": 0.4516983331739903, + "epoch": 2.4069611780455156, + "grad_norm": 0.26972198486328125, + "learning_rate": 0.00022008813646608725, + "loss": 0.4115512466430664, + "mean_token_accuracy": 0.8761435833573341, + "num_tokens": 2304612.0, + "step": 900 + }, + { + "entropy": 0.45280722543597224, + "epoch": 2.540829986613119, + "grad_norm": 0.2643600106239319, + "learning_rate": 0.00021740883848518684, + "loss": 0.41181053161621095, + "mean_token_accuracy": 0.8756420350074768, + "num_tokens": 2430946.0, + "step": 950 + }, + { + "entropy": 0.4487862553447485, + "epoch": 2.674698795180723, + "grad_norm": 0.2849285304546356, + "learning_rate": 0.00021451074549244846, + "loss": 0.4094270706176758, + "mean_token_accuracy": 0.8771369129419326, + "num_tokens": 2557241.0, + "step": 1000 + }, + { + "entropy": 0.45000545382499696, + "epoch": 2.8085676037483265, + "grad_norm": 0.24081671237945557, + "learning_rate": 0.0002114001677155633, + "loss": 0.4073855972290039, + "mean_token_accuracy": 0.8779223081469536, + "num_tokens": 2692775.0, + "step": 1050 + }, + { + "entropy": 0.4576124830543995, + "epoch": 2.9424364123159306, + "grad_norm": 0.2341010719537735, + "learning_rate": 0.00020808387804072673, + "loss": 0.4154107666015625, + "mean_token_accuracy": 0.8756425747275353, + "num_tokens": 2823060.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.4812743577361107, + "eval_loss": 0.523208498954773, + "eval_mean_token_accuracy": 0.8494922530651092, + "eval_num_tokens": 2879229.0, + "eval_runtime": 51.3707, + "eval_samples_per_second": 31.107, + "eval_steps_per_second": 3.893, + "step": 1122 + }, + { + "entropy": 0.41342759042075183, + "epoch": 3.074966532797858, + "grad_norm": 0.34829744696617126, + "learning_rate": 0.0002045690972655427, + "loss": 0.3644887542724609, + "mean_token_accuracy": 0.8879408977850519, + "num_tokens": 2955424.0, + "step": 1150 + }, + { + "entropy": 0.3948360003530979, + "epoch": 3.208835341365462, + "grad_norm": 0.312427282333374, + "learning_rate": 0.00020086347837665854, + "loss": 0.34146129608154296, + "mean_token_accuracy": 0.8914449456334114, + "num_tokens": 3078799.0, + "step": 1200 + }, + { + "entropy": 0.38602679744362833, + "epoch": 3.3427041499330654, + "grad_norm": 0.2715625762939453, + "learning_rate": 0.0001969750898863629, + "loss": 0.34105979919433593, + "mean_token_accuracy": 0.8928995525836945, + "num_tokens": 3211945.0, + "step": 1250 + }, + { + "entropy": 0.3923524462431669, + "epoch": 3.4765729585006695, + "grad_norm": 0.27295640110969543, + "learning_rate": 0.00019291239826442992, + "loss": 0.3473458099365234, + "mean_token_accuracy": 0.8913829082250595, + "num_tokens": 3343933.0, + "step": 1300 + }, + { + "entropy": 0.40172914519906044, + "epoch": 3.610441767068273, + "grad_norm": 0.32684192061424255, + "learning_rate": 0.0001886842495034615, + "loss": 0.35543827056884764, + "mean_token_accuracy": 0.8904094022512435, + "num_tokens": 3470087.0, + "step": 1350 + }, + { + "entropy": 0.4010512103140354, + "epoch": 3.7443105756358768, + "grad_norm": 0.23922984302043915, + "learning_rate": 0.00018429984985786734, + "loss": 0.3535212326049805, + "mean_token_accuracy": 0.8891122484207153, + "num_tokens": 3590858.0, + "step": 1400 + }, + { + "entropy": 0.39226082623004915, + "epoch": 3.878179384203481, + "grad_norm": 0.25130993127822876, + "learning_rate": 0.00017976874579842046, + "loss": 0.3484851837158203, + "mean_token_accuracy": 0.8916165816783905, + "num_tokens": 3725806.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.45332007586956025, + "eval_loss": 0.5243425965309143, + "eval_mean_token_accuracy": 0.8532214590907097, + "eval_num_tokens": 3838972.0, + "eval_runtime": 51.3837, + "eval_samples_per_second": 31.099, + "eval_steps_per_second": 3.892, + "step": 1496 + }, + { + "entropy": 0.39358587043754983, + "epoch": 4.010709504685408, + "grad_norm": 0.27960067987442017, + "learning_rate": 0.0001751008032260355, + "loss": 0.34616813659667967, + "mean_token_accuracy": 0.8923709010234987, + "num_tokens": 3849380.0, + "step": 1500 + }, + { + "entropy": 0.3206369188427925, + "epoch": 4.144578313253012, + "grad_norm": 0.37261858582496643, + "learning_rate": 0.00017030618599002818, + "loss": 0.2684581565856934, + "mean_token_accuracy": 0.9131761506199837, + "num_tokens": 3976694.0, + "step": 1550 + }, + { + "entropy": 0.3254615054279566, + "epoch": 4.278447121820616, + "grad_norm": 0.39803799986839294, + "learning_rate": 0.00016539533375763032, + "loss": 0.2769618606567383, + "mean_token_accuracy": 0.9102409112453461, + "num_tokens": 4110204.0, + "step": 1600 + }, + { + "entropy": 0.32003962114453316, + "epoch": 4.412315930388219, + "grad_norm": 0.35707366466522217, + "learning_rate": 0.0001603789392829468, + "loss": 0.2749842834472656, + "mean_token_accuracy": 0.910626070201397, + "num_tokens": 4240883.0, + "step": 1650 + }, + { + "entropy": 0.32672152675688265, + "epoch": 4.546184738955823, + "grad_norm": 0.47052621841430664, + "learning_rate": 0.00015526792512484774, + "loss": 0.27983531951904295, + "mean_token_accuracy": 0.9093958771228791, + "num_tokens": 4365381.0, + "step": 1700 + }, + { + "entropy": 0.33449163861572745, + "epoch": 4.680053547523427, + "grad_norm": 0.330709844827652, + "learning_rate": 0.00015007341986449012, + "loss": 0.28533639907836916, + "mean_token_accuracy": 0.9082232251763344, + "num_tokens": 4490711.0, + "step": 1750 + }, + { + "entropy": 0.33353066638112067, + "epoch": 4.813922356091031, + "grad_norm": 0.3990134298801422, + "learning_rate": 0.00014480673387425272, + "loss": 0.28489078521728517, + "mean_token_accuracy": 0.908001911342144, + "num_tokens": 4618532.0, + "step": 1800 + }, + { + "entropy": 0.3272412090748549, + "epoch": 4.947791164658635, + "grad_norm": 0.3183020353317261, + "learning_rate": 0.00013947933469084315, + "loss": 0.2772365379333496, + "mean_token_accuracy": 0.908946952521801, + "num_tokens": 4752261.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.3932068006694317, + "eval_loss": 0.5647156834602356, + "eval_mean_token_accuracy": 0.85078628718853, + "eval_num_tokens": 4798715.0, + "eval_runtime": 51.3578, + "eval_samples_per_second": 31.115, + "eval_steps_per_second": 3.894, + "step": 1870 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.0568507616161485e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/README.md b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/adapter_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8818eeae60314bb5e6cd8bb9d31975488784721f --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.0094403300459725, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "q_proj", + "gate_proj", + "o_proj", + "v_proj", + "k_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/chat_template.jinja b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/tokenizer_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/trainer_state.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..838942db2ba3089518ff6c523ad22f7c0150b924 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/trainer_state.json @@ -0,0 +1,540 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.0, + "eval_steps": 500, + "global_step": 2244, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.2357245934009553, + "epoch": 0.13386880856760375, + "grad_norm": 1.1940524578094482, + "learning_rate": 3.0637587087867373e-05, + "loss": 2.0509376525878906, + "mean_token_accuracy": 0.6194104523956776, + "num_tokens": 127704.0, + "step": 50 + }, + { + "entropy": 0.8588631230592728, + "epoch": 0.2677376171352075, + "grad_norm": 0.6621095538139343, + "learning_rate": 6.190043105507899e-05, + "loss": 0.8043325805664062, + "mean_token_accuracy": 0.7855384379625321, + "num_tokens": 256077.0, + "step": 100 + }, + { + "entropy": 0.6726470375061036, + "epoch": 0.40160642570281124, + "grad_norm": 0.5255736112594604, + "learning_rate": 9.316327502229059e-05, + "loss": 0.6415129089355469, + "mean_token_accuracy": 0.8183896672725678, + "num_tokens": 387735.0, + "step": 150 + }, + { + "entropy": 0.6119109469652176, + "epoch": 0.535475234270415, + "grad_norm": 0.45975860953330994, + "learning_rate": 0.0001244261189895022, + "loss": 0.576114501953125, + "mean_token_accuracy": 0.8376823288202285, + "num_tokens": 522202.0, + "step": 200 + }, + { + "entropy": 0.5972725109755993, + "epoch": 0.6693440428380187, + "grad_norm": 0.40055611729621887, + "learning_rate": 0.0001556889629567138, + "loss": 0.5613529205322265, + "mean_token_accuracy": 0.8418879929184914, + "num_tokens": 648663.0, + "step": 250 + }, + { + "entropy": 0.5673307004570961, + "epoch": 0.8032128514056225, + "grad_norm": 0.7427454590797424, + "learning_rate": 0.0001869518069239254, + "loss": 0.5325925827026368, + "mean_token_accuracy": 0.8488189685344696, + "num_tokens": 778245.0, + "step": 300 + }, + { + "entropy": 0.5615521620213986, + "epoch": 0.9370816599732262, + "grad_norm": 0.4017025828361511, + "learning_rate": 0.000218214650891137, + "loss": 0.522266616821289, + "mean_token_accuracy": 0.8500416606664658, + "num_tokens": 905328.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5758187392354012, + "eval_loss": 0.5664522051811218, + "eval_mean_token_accuracy": 0.8383084440231323, + "eval_num_tokens": 959743.0, + "eval_runtime": 51.7964, + "eval_samples_per_second": 30.852, + "eval_steps_per_second": 3.861, + "step": 374 + }, + { + "entropy": 0.5440343178883947, + "epoch": 1.069611780455154, + "grad_norm": 0.3084093928337097, + "learning_rate": 0.00023381424541885068, + "loss": 0.5051319122314453, + "mean_token_accuracy": 0.8554374280602041, + "num_tokens": 1022581.0, + "step": 400 + }, + { + "entropy": 0.5245272906124592, + "epoch": 1.2034805890227578, + "grad_norm": 0.3342743515968323, + "learning_rate": 0.00023355972972676628, + "loss": 0.4855318450927734, + "mean_token_accuracy": 0.8584430786967278, + "num_tokens": 1151430.0, + "step": 450 + }, + { + "entropy": 0.5234513898193837, + "epoch": 1.3373493975903614, + "grad_norm": 0.26953577995300293, + "learning_rate": 0.00023305125251804043, + "loss": 0.4811768341064453, + "mean_token_accuracy": 0.8606387570500373, + "num_tokens": 1282481.0, + "step": 500 + }, + { + "entropy": 0.5147616830468178, + "epoch": 1.4712182061579653, + "grad_norm": 0.28791534900665283, + "learning_rate": 0.0002322899209369128, + "loss": 0.4762062835693359, + "mean_token_accuracy": 0.8612849581241607, + "num_tokens": 1414166.0, + "step": 550 + }, + { + "entropy": 0.5060296922922134, + "epoch": 1.605087014725569, + "grad_norm": 0.2496163249015808, + "learning_rate": 0.0002312773926857543, + "loss": 0.4695176315307617, + "mean_token_accuracy": 0.8636157616972924, + "num_tokens": 1547902.0, + "step": 600 + }, + { + "entropy": 0.5008939932286739, + "epoch": 1.7389558232931726, + "grad_norm": 0.2329542636871338, + "learning_rate": 0.00023001587241563198, + "loss": 0.46317913055419924, + "mean_token_accuracy": 0.8652430367469788, + "num_tokens": 1678635.0, + "step": 650 + }, + { + "entropy": 0.4986082436144352, + "epoch": 1.8728246318607764, + "grad_norm": 0.2459402084350586, + "learning_rate": 0.00022850810692596235, + "loss": 0.4617066192626953, + "mean_token_accuracy": 0.8672981086373329, + "num_tokens": 1803328.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.546445109397173, + "eval_loss": 0.5307288765907288, + "eval_mean_token_accuracy": 0.8434162598848343, + "eval_num_tokens": 1919486.0, + "eval_runtime": 51.3627, + "eval_samples_per_second": 31.112, + "eval_steps_per_second": 3.894, + "step": 748 + }, + { + "entropy": 0.5069879525237613, + "epoch": 2.005354752342704, + "grad_norm": 0.21533548831939697, + "learning_rate": 0.00022675737918370628, + "loss": 0.4585062026977539, + "mean_token_accuracy": 0.865652882390552, + "num_tokens": 1925307.0, + "step": 750 + }, + { + "entropy": 0.4457989126443863, + "epoch": 2.139223560910308, + "grad_norm": 0.24705350399017334, + "learning_rate": 0.00022476750117512737, + "loss": 0.4026710891723633, + "mean_token_accuracy": 0.8774338760972022, + "num_tokens": 2055627.0, + "step": 800 + }, + { + "entropy": 0.45084999009966853, + "epoch": 2.2730923694779115, + "grad_norm": 0.26643863320350647, + "learning_rate": 0.00022254280560567822, + "loss": 0.40950340270996094, + "mean_token_accuracy": 0.8752825647592545, + "num_tokens": 2177850.0, + "step": 850 + }, + { + "entropy": 0.4516983331739903, + "epoch": 2.4069611780455156, + "grad_norm": 0.26972198486328125, + "learning_rate": 0.00022008813646608725, + "loss": 0.4115512466430664, + "mean_token_accuracy": 0.8761435833573341, + "num_tokens": 2304612.0, + "step": 900 + }, + { + "entropy": 0.45280722543597224, + "epoch": 2.540829986613119, + "grad_norm": 0.2643600106239319, + "learning_rate": 0.00021740883848518684, + "loss": 0.41181053161621095, + "mean_token_accuracy": 0.8756420350074768, + "num_tokens": 2430946.0, + "step": 950 + }, + { + "entropy": 0.4487862553447485, + "epoch": 2.674698795180723, + "grad_norm": 0.2849285304546356, + "learning_rate": 0.00021451074549244846, + "loss": 0.4094270706176758, + "mean_token_accuracy": 0.8771369129419326, + "num_tokens": 2557241.0, + "step": 1000 + }, + { + "entropy": 0.45000545382499696, + "epoch": 2.8085676037483265, + "grad_norm": 0.24081671237945557, + "learning_rate": 0.0002114001677155633, + "loss": 0.4073855972290039, + "mean_token_accuracy": 0.8779223081469536, + "num_tokens": 2692775.0, + "step": 1050 + }, + { + "entropy": 0.4576124830543995, + "epoch": 2.9424364123159306, + "grad_norm": 0.2341010719537735, + "learning_rate": 0.00020808387804072673, + "loss": 0.4154107666015625, + "mean_token_accuracy": 0.8756425747275353, + "num_tokens": 2823060.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.4812743577361107, + "eval_loss": 0.523208498954773, + "eval_mean_token_accuracy": 0.8494922530651092, + "eval_num_tokens": 2879229.0, + "eval_runtime": 51.3707, + "eval_samples_per_second": 31.107, + "eval_steps_per_second": 3.893, + "step": 1122 + }, + { + "entropy": 0.41342759042075183, + "epoch": 3.074966532797858, + "grad_norm": 0.34829744696617126, + "learning_rate": 0.0002045690972655427, + "loss": 0.3644887542724609, + "mean_token_accuracy": 0.8879408977850519, + "num_tokens": 2955424.0, + "step": 1150 + }, + { + "entropy": 0.3948360003530979, + "epoch": 3.208835341365462, + "grad_norm": 0.312427282333374, + "learning_rate": 0.00020086347837665854, + "loss": 0.34146129608154296, + "mean_token_accuracy": 0.8914449456334114, + "num_tokens": 3078799.0, + "step": 1200 + }, + { + "entropy": 0.38602679744362833, + "epoch": 3.3427041499330654, + "grad_norm": 0.2715625762939453, + "learning_rate": 0.0001969750898863629, + "loss": 0.34105979919433593, + "mean_token_accuracy": 0.8928995525836945, + "num_tokens": 3211945.0, + "step": 1250 + }, + { + "entropy": 0.3923524462431669, + "epoch": 3.4765729585006695, + "grad_norm": 0.27295640110969543, + "learning_rate": 0.00019291239826442992, + "loss": 0.3473458099365234, + "mean_token_accuracy": 0.8913829082250595, + "num_tokens": 3343933.0, + "step": 1300 + }, + { + "entropy": 0.40172914519906044, + "epoch": 3.610441767068273, + "grad_norm": 0.32684192061424255, + "learning_rate": 0.0001886842495034615, + "loss": 0.35543827056884764, + "mean_token_accuracy": 0.8904094022512435, + "num_tokens": 3470087.0, + "step": 1350 + }, + { + "entropy": 0.4010512103140354, + "epoch": 3.7443105756358768, + "grad_norm": 0.23922984302043915, + "learning_rate": 0.00018429984985786734, + "loss": 0.3535212326049805, + "mean_token_accuracy": 0.8891122484207153, + "num_tokens": 3590858.0, + "step": 1400 + }, + { + "entropy": 0.39226082623004915, + "epoch": 3.878179384203481, + "grad_norm": 0.25130993127822876, + "learning_rate": 0.00017976874579842046, + "loss": 0.3484851837158203, + "mean_token_accuracy": 0.8916165816783905, + "num_tokens": 3725806.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.45332007586956025, + "eval_loss": 0.5243425965309143, + "eval_mean_token_accuracy": 0.8532214590907097, + "eval_num_tokens": 3838972.0, + "eval_runtime": 51.3837, + "eval_samples_per_second": 31.099, + "eval_steps_per_second": 3.892, + "step": 1496 + }, + { + "entropy": 0.39358587043754983, + "epoch": 4.010709504685408, + "grad_norm": 0.27960067987442017, + "learning_rate": 0.0001751008032260355, + "loss": 0.34616813659667967, + "mean_token_accuracy": 0.8923709010234987, + "num_tokens": 3849380.0, + "step": 1500 + }, + { + "entropy": 0.3206369188427925, + "epoch": 4.144578313253012, + "grad_norm": 0.37261858582496643, + "learning_rate": 0.00017030618599002818, + "loss": 0.2684581565856934, + "mean_token_accuracy": 0.9131761506199837, + "num_tokens": 3976694.0, + "step": 1550 + }, + { + "entropy": 0.3254615054279566, + "epoch": 4.278447121820616, + "grad_norm": 0.39803799986839294, + "learning_rate": 0.00016539533375763032, + "loss": 0.2769618606567383, + "mean_token_accuracy": 0.9102409112453461, + "num_tokens": 4110204.0, + "step": 1600 + }, + { + "entropy": 0.32003962114453316, + "epoch": 4.412315930388219, + "grad_norm": 0.35707366466522217, + "learning_rate": 0.0001603789392829468, + "loss": 0.2749842834472656, + "mean_token_accuracy": 0.910626070201397, + "num_tokens": 4240883.0, + "step": 1650 + }, + { + "entropy": 0.32672152675688265, + "epoch": 4.546184738955823, + "grad_norm": 0.47052621841430664, + "learning_rate": 0.00015526792512484774, + "loss": 0.27983531951904295, + "mean_token_accuracy": 0.9093958771228791, + "num_tokens": 4365381.0, + "step": 1700 + }, + { + "entropy": 0.33449163861572745, + "epoch": 4.680053547523427, + "grad_norm": 0.330709844827652, + "learning_rate": 0.00015007341986449012, + "loss": 0.28533639907836916, + "mean_token_accuracy": 0.9082232251763344, + "num_tokens": 4490711.0, + "step": 1750 + }, + { + "entropy": 0.33353066638112067, + "epoch": 4.813922356091031, + "grad_norm": 0.3990134298801422, + "learning_rate": 0.00014480673387425272, + "loss": 0.28489078521728517, + "mean_token_accuracy": 0.908001911342144, + "num_tokens": 4618532.0, + "step": 1800 + }, + { + "entropy": 0.3272412090748549, + "epoch": 4.947791164658635, + "grad_norm": 0.3183020353317261, + "learning_rate": 0.00013947933469084315, + "loss": 0.2772365379333496, + "mean_token_accuracy": 0.908946952521801, + "num_tokens": 4752261.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.3932068006694317, + "eval_loss": 0.5647156834602356, + "eval_mean_token_accuracy": 0.85078628718853, + "eval_num_tokens": 4798715.0, + "eval_runtime": 51.3578, + "eval_samples_per_second": 31.115, + "eval_steps_per_second": 3.894, + "step": 1870 + }, + { + "entropy": 0.2832252390005372, + "epoch": 5.080321285140562, + "grad_norm": 0.3697633147239685, + "learning_rate": 0.00013410282204620014, + "loss": 0.2279021453857422, + "mean_token_accuracy": 0.9252248072262966, + "num_tokens": 4879271.0, + "step": 1900 + }, + { + "entropy": 0.250804705247283, + "epoch": 5.214190093708166, + "grad_norm": 0.3890162706375122, + "learning_rate": 0.00012868890261055722, + "loss": 0.1980854606628418, + "mean_token_accuracy": 0.9338876655697823, + "num_tokens": 5005076.0, + "step": 1950 + }, + { + "entropy": 0.2531572911888361, + "epoch": 5.34805890227577, + "grad_norm": 0.43466225266456604, + "learning_rate": 0.0001232493645026623, + "loss": 0.20114482879638673, + "mean_token_accuracy": 0.9317018255591393, + "num_tokens": 5133591.0, + "step": 2000 + }, + { + "entropy": 0.25918263107538225, + "epoch": 5.481927710843373, + "grad_norm": 0.38253673911094666, + "learning_rate": 0.00011779605162265297, + "loss": 0.2056061363220215, + "mean_token_accuracy": 0.9302830925583839, + "num_tokens": 5257252.0, + "step": 2050 + }, + { + "entropy": 0.2553627458959818, + "epoch": 5.615796519410977, + "grad_norm": 0.4536231458187103, + "learning_rate": 0.00011234083786347563, + "loss": 0.20531394958496094, + "mean_token_accuracy": 0.9302299374341965, + "num_tokens": 5388652.0, + "step": 2100 + }, + { + "entropy": 0.2575570110231638, + "epoch": 5.749665327978581, + "grad_norm": 0.36399731040000916, + "learning_rate": 0.00010689560125699833, + "loss": 0.2048162841796875, + "mean_token_accuracy": 0.9306997761130333, + "num_tokens": 5515488.0, + "step": 2150 + }, + { + "entropy": 0.24660897620022296, + "epoch": 5.883534136546185, + "grad_norm": 0.43602702021598816, + "learning_rate": 0.00010147219811111233, + "loss": 0.1986431884765625, + "mean_token_accuracy": 0.9335029146075249, + "num_tokens": 5644323.0, + "step": 2200 + }, + { + "epoch": 6.0, + "eval_entropy": 0.32280903935432437, + "eval_loss": 0.6383674144744873, + "eval_mean_token_accuracy": 0.849581449329853, + "eval_num_tokens": 5758458.0, + "eval_runtime": 51.3251, + "eval_samples_per_second": 31.135, + "eval_steps_per_second": 3.897, + "step": 2244 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.465057375940987e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/README.md b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/adapter_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8818eeae60314bb5e6cd8bb9d31975488784721f --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.0094403300459725, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "q_proj", + "gate_proj", + "o_proj", + "v_proj", + "k_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/chat_template.jinja b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/tokenizer_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/trainer_state.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..98c363caf2498eca36506878bf632fd3e3a40abe --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/trainer_state.json @@ -0,0 +1,631 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.0, + "eval_steps": 500, + "global_step": 2618, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.2357245934009553, + "epoch": 0.13386880856760375, + "grad_norm": 1.1940524578094482, + "learning_rate": 3.0637587087867373e-05, + "loss": 2.0509376525878906, + "mean_token_accuracy": 0.6194104523956776, + "num_tokens": 127704.0, + "step": 50 + }, + { + "entropy": 0.8588631230592728, + "epoch": 0.2677376171352075, + "grad_norm": 0.6621095538139343, + "learning_rate": 6.190043105507899e-05, + "loss": 0.8043325805664062, + "mean_token_accuracy": 0.7855384379625321, + "num_tokens": 256077.0, + "step": 100 + }, + { + "entropy": 0.6726470375061036, + "epoch": 0.40160642570281124, + "grad_norm": 0.5255736112594604, + "learning_rate": 9.316327502229059e-05, + "loss": 0.6415129089355469, + "mean_token_accuracy": 0.8183896672725678, + "num_tokens": 387735.0, + "step": 150 + }, + { + "entropy": 0.6119109469652176, + "epoch": 0.535475234270415, + "grad_norm": 0.45975860953330994, + "learning_rate": 0.0001244261189895022, + "loss": 0.576114501953125, + "mean_token_accuracy": 0.8376823288202285, + "num_tokens": 522202.0, + "step": 200 + }, + { + "entropy": 0.5972725109755993, + "epoch": 0.6693440428380187, + "grad_norm": 0.40055611729621887, + "learning_rate": 0.0001556889629567138, + "loss": 0.5613529205322265, + "mean_token_accuracy": 0.8418879929184914, + "num_tokens": 648663.0, + "step": 250 + }, + { + "entropy": 0.5673307004570961, + "epoch": 0.8032128514056225, + "grad_norm": 0.7427454590797424, + "learning_rate": 0.0001869518069239254, + "loss": 0.5325925827026368, + "mean_token_accuracy": 0.8488189685344696, + "num_tokens": 778245.0, + "step": 300 + }, + { + "entropy": 0.5615521620213986, + "epoch": 0.9370816599732262, + "grad_norm": 0.4017025828361511, + "learning_rate": 0.000218214650891137, + "loss": 0.522266616821289, + "mean_token_accuracy": 0.8500416606664658, + "num_tokens": 905328.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5758187392354012, + "eval_loss": 0.5664522051811218, + "eval_mean_token_accuracy": 0.8383084440231323, + "eval_num_tokens": 959743.0, + "eval_runtime": 51.7964, + "eval_samples_per_second": 30.852, + "eval_steps_per_second": 3.861, + "step": 374 + }, + { + "entropy": 0.5440343178883947, + "epoch": 1.069611780455154, + "grad_norm": 0.3084093928337097, + "learning_rate": 0.00023381424541885068, + "loss": 0.5051319122314453, + "mean_token_accuracy": 0.8554374280602041, + "num_tokens": 1022581.0, + "step": 400 + }, + { + "entropy": 0.5245272906124592, + "epoch": 1.2034805890227578, + "grad_norm": 0.3342743515968323, + "learning_rate": 0.00023355972972676628, + "loss": 0.4855318450927734, + "mean_token_accuracy": 0.8584430786967278, + "num_tokens": 1151430.0, + "step": 450 + }, + { + "entropy": 0.5234513898193837, + "epoch": 1.3373493975903614, + "grad_norm": 0.26953577995300293, + "learning_rate": 0.00023305125251804043, + "loss": 0.4811768341064453, + "mean_token_accuracy": 0.8606387570500373, + "num_tokens": 1282481.0, + "step": 500 + }, + { + "entropy": 0.5147616830468178, + "epoch": 1.4712182061579653, + "grad_norm": 0.28791534900665283, + "learning_rate": 0.0002322899209369128, + "loss": 0.4762062835693359, + "mean_token_accuracy": 0.8612849581241607, + "num_tokens": 1414166.0, + "step": 550 + }, + { + "entropy": 0.5060296922922134, + "epoch": 1.605087014725569, + "grad_norm": 0.2496163249015808, + "learning_rate": 0.0002312773926857543, + "loss": 0.4695176315307617, + "mean_token_accuracy": 0.8636157616972924, + "num_tokens": 1547902.0, + "step": 600 + }, + { + "entropy": 0.5008939932286739, + "epoch": 1.7389558232931726, + "grad_norm": 0.2329542636871338, + "learning_rate": 0.00023001587241563198, + "loss": 0.46317913055419924, + "mean_token_accuracy": 0.8652430367469788, + "num_tokens": 1678635.0, + "step": 650 + }, + { + "entropy": 0.4986082436144352, + "epoch": 1.8728246318607764, + "grad_norm": 0.2459402084350586, + "learning_rate": 0.00022850810692596235, + "loss": 0.4617066192626953, + "mean_token_accuracy": 0.8672981086373329, + "num_tokens": 1803328.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.546445109397173, + "eval_loss": 0.5307288765907288, + "eval_mean_token_accuracy": 0.8434162598848343, + "eval_num_tokens": 1919486.0, + "eval_runtime": 51.3627, + "eval_samples_per_second": 31.112, + "eval_steps_per_second": 3.894, + "step": 748 + }, + { + "entropy": 0.5069879525237613, + "epoch": 2.005354752342704, + "grad_norm": 0.21533548831939697, + "learning_rate": 0.00022675737918370628, + "loss": 0.4585062026977539, + "mean_token_accuracy": 0.865652882390552, + "num_tokens": 1925307.0, + "step": 750 + }, + { + "entropy": 0.4457989126443863, + "epoch": 2.139223560910308, + "grad_norm": 0.24705350399017334, + "learning_rate": 0.00022476750117512737, + "loss": 0.4026710891723633, + "mean_token_accuracy": 0.8774338760972022, + "num_tokens": 2055627.0, + "step": 800 + }, + { + "entropy": 0.45084999009966853, + "epoch": 2.2730923694779115, + "grad_norm": 0.26643863320350647, + "learning_rate": 0.00022254280560567822, + "loss": 0.40950340270996094, + "mean_token_accuracy": 0.8752825647592545, + "num_tokens": 2177850.0, + "step": 850 + }, + { + "entropy": 0.4516983331739903, + "epoch": 2.4069611780455156, + "grad_norm": 0.26972198486328125, + "learning_rate": 0.00022008813646608725, + "loss": 0.4115512466430664, + "mean_token_accuracy": 0.8761435833573341, + "num_tokens": 2304612.0, + "step": 900 + }, + { + "entropy": 0.45280722543597224, + "epoch": 2.540829986613119, + "grad_norm": 0.2643600106239319, + "learning_rate": 0.00021740883848518684, + "loss": 0.41181053161621095, + "mean_token_accuracy": 0.8756420350074768, + "num_tokens": 2430946.0, + "step": 950 + }, + { + "entropy": 0.4487862553447485, + "epoch": 2.674698795180723, + "grad_norm": 0.2849285304546356, + "learning_rate": 0.00021451074549244846, + "loss": 0.4094270706176758, + "mean_token_accuracy": 0.8771369129419326, + "num_tokens": 2557241.0, + "step": 1000 + }, + { + "entropy": 0.45000545382499696, + "epoch": 2.8085676037483265, + "grad_norm": 0.24081671237945557, + "learning_rate": 0.0002114001677155633, + "loss": 0.4073855972290039, + "mean_token_accuracy": 0.8779223081469536, + "num_tokens": 2692775.0, + "step": 1050 + }, + { + "entropy": 0.4576124830543995, + "epoch": 2.9424364123159306, + "grad_norm": 0.2341010719537735, + "learning_rate": 0.00020808387804072673, + "loss": 0.4154107666015625, + "mean_token_accuracy": 0.8756425747275353, + "num_tokens": 2823060.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.4812743577361107, + "eval_loss": 0.523208498954773, + "eval_mean_token_accuracy": 0.8494922530651092, + "eval_num_tokens": 2879229.0, + "eval_runtime": 51.3707, + "eval_samples_per_second": 31.107, + "eval_steps_per_second": 3.893, + "step": 1122 + }, + { + "entropy": 0.41342759042075183, + "epoch": 3.074966532797858, + "grad_norm": 0.34829744696617126, + "learning_rate": 0.0002045690972655427, + "loss": 0.3644887542724609, + "mean_token_accuracy": 0.8879408977850519, + "num_tokens": 2955424.0, + "step": 1150 + }, + { + "entropy": 0.3948360003530979, + "epoch": 3.208835341365462, + "grad_norm": 0.312427282333374, + "learning_rate": 0.00020086347837665854, + "loss": 0.34146129608154296, + "mean_token_accuracy": 0.8914449456334114, + "num_tokens": 3078799.0, + "step": 1200 + }, + { + "entropy": 0.38602679744362833, + "epoch": 3.3427041499330654, + "grad_norm": 0.2715625762939453, + "learning_rate": 0.0001969750898863629, + "loss": 0.34105979919433593, + "mean_token_accuracy": 0.8928995525836945, + "num_tokens": 3211945.0, + "step": 1250 + }, + { + "entropy": 0.3923524462431669, + "epoch": 3.4765729585006695, + "grad_norm": 0.27295640110969543, + "learning_rate": 0.00019291239826442992, + "loss": 0.3473458099365234, + "mean_token_accuracy": 0.8913829082250595, + "num_tokens": 3343933.0, + "step": 1300 + }, + { + "entropy": 0.40172914519906044, + "epoch": 3.610441767068273, + "grad_norm": 0.32684192061424255, + "learning_rate": 0.0001886842495034615, + "loss": 0.35543827056884764, + "mean_token_accuracy": 0.8904094022512435, + "num_tokens": 3470087.0, + "step": 1350 + }, + { + "entropy": 0.4010512103140354, + "epoch": 3.7443105756358768, + "grad_norm": 0.23922984302043915, + "learning_rate": 0.00018429984985786734, + "loss": 0.3535212326049805, + "mean_token_accuracy": 0.8891122484207153, + "num_tokens": 3590858.0, + "step": 1400 + }, + { + "entropy": 0.39226082623004915, + "epoch": 3.878179384203481, + "grad_norm": 0.25130993127822876, + "learning_rate": 0.00017976874579842046, + "loss": 0.3484851837158203, + "mean_token_accuracy": 0.8916165816783905, + "num_tokens": 3725806.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.45332007586956025, + "eval_loss": 0.5243425965309143, + "eval_mean_token_accuracy": 0.8532214590907097, + "eval_num_tokens": 3838972.0, + "eval_runtime": 51.3837, + "eval_samples_per_second": 31.099, + "eval_steps_per_second": 3.892, + "step": 1496 + }, + { + "entropy": 0.39358587043754983, + "epoch": 4.010709504685408, + "grad_norm": 0.27960067987442017, + "learning_rate": 0.0001751008032260355, + "loss": 0.34616813659667967, + "mean_token_accuracy": 0.8923709010234987, + "num_tokens": 3849380.0, + "step": 1500 + }, + { + "entropy": 0.3206369188427925, + "epoch": 4.144578313253012, + "grad_norm": 0.37261858582496643, + "learning_rate": 0.00017030618599002818, + "loss": 0.2684581565856934, + "mean_token_accuracy": 0.9131761506199837, + "num_tokens": 3976694.0, + "step": 1550 + }, + { + "entropy": 0.3254615054279566, + "epoch": 4.278447121820616, + "grad_norm": 0.39803799986839294, + "learning_rate": 0.00016539533375763032, + "loss": 0.2769618606567383, + "mean_token_accuracy": 0.9102409112453461, + "num_tokens": 4110204.0, + "step": 1600 + }, + { + "entropy": 0.32003962114453316, + "epoch": 4.412315930388219, + "grad_norm": 0.35707366466522217, + "learning_rate": 0.0001603789392829468, + "loss": 0.2749842834472656, + "mean_token_accuracy": 0.910626070201397, + "num_tokens": 4240883.0, + "step": 1650 + }, + { + "entropy": 0.32672152675688265, + "epoch": 4.546184738955823, + "grad_norm": 0.47052621841430664, + "learning_rate": 0.00015526792512484774, + "loss": 0.27983531951904295, + "mean_token_accuracy": 0.9093958771228791, + "num_tokens": 4365381.0, + "step": 1700 + }, + { + "entropy": 0.33449163861572745, + "epoch": 4.680053547523427, + "grad_norm": 0.330709844827652, + "learning_rate": 0.00015007341986449012, + "loss": 0.28533639907836916, + "mean_token_accuracy": 0.9082232251763344, + "num_tokens": 4490711.0, + "step": 1750 + }, + { + "entropy": 0.33353066638112067, + "epoch": 4.813922356091031, + "grad_norm": 0.3990134298801422, + "learning_rate": 0.00014480673387425272, + "loss": 0.28489078521728517, + "mean_token_accuracy": 0.908001911342144, + "num_tokens": 4618532.0, + "step": 1800 + }, + { + "entropy": 0.3272412090748549, + "epoch": 4.947791164658635, + "grad_norm": 0.3183020353317261, + "learning_rate": 0.00013947933469084315, + "loss": 0.2772365379333496, + "mean_token_accuracy": 0.908946952521801, + "num_tokens": 4752261.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.3932068006694317, + "eval_loss": 0.5647156834602356, + "eval_mean_token_accuracy": 0.85078628718853, + "eval_num_tokens": 4798715.0, + "eval_runtime": 51.3578, + "eval_samples_per_second": 31.115, + "eval_steps_per_second": 3.894, + "step": 1870 + }, + { + "entropy": 0.2832252390005372, + "epoch": 5.080321285140562, + "grad_norm": 0.3697633147239685, + "learning_rate": 0.00013410282204620014, + "loss": 0.2279021453857422, + "mean_token_accuracy": 0.9252248072262966, + "num_tokens": 4879271.0, + "step": 1900 + }, + { + "entropy": 0.250804705247283, + "epoch": 5.214190093708166, + "grad_norm": 0.3890162706375122, + "learning_rate": 0.00012868890261055722, + "loss": 0.1980854606628418, + "mean_token_accuracy": 0.9338876655697823, + "num_tokens": 5005076.0, + "step": 1950 + }, + { + "entropy": 0.2531572911888361, + "epoch": 5.34805890227577, + "grad_norm": 0.43466225266456604, + "learning_rate": 0.0001232493645026623, + "loss": 0.20114482879638673, + "mean_token_accuracy": 0.9317018255591393, + "num_tokens": 5133591.0, + "step": 2000 + }, + { + "entropy": 0.25918263107538225, + "epoch": 5.481927710843373, + "grad_norm": 0.38253673911094666, + "learning_rate": 0.00011779605162265297, + "loss": 0.2056061363220215, + "mean_token_accuracy": 0.9302830925583839, + "num_tokens": 5257252.0, + "step": 2050 + }, + { + "entropy": 0.2553627458959818, + "epoch": 5.615796519410977, + "grad_norm": 0.4536231458187103, + "learning_rate": 0.00011234083786347563, + "loss": 0.20531394958496094, + "mean_token_accuracy": 0.9302299374341965, + "num_tokens": 5388652.0, + "step": 2100 + }, + { + "entropy": 0.2575570110231638, + "epoch": 5.749665327978581, + "grad_norm": 0.36399731040000916, + "learning_rate": 0.00010689560125699833, + "loss": 0.2048162841796875, + "mean_token_accuracy": 0.9306997761130333, + "num_tokens": 5515488.0, + "step": 2150 + }, + { + "entropy": 0.24660897620022296, + "epoch": 5.883534136546185, + "grad_norm": 0.43602702021598816, + "learning_rate": 0.00010147219811111233, + "loss": 0.1986431884765625, + "mean_token_accuracy": 0.9335029146075249, + "num_tokens": 5644323.0, + "step": 2200 + }, + { + "epoch": 6.0, + "eval_entropy": 0.32280903935432437, + "eval_loss": 0.6383674144744873, + "eval_mean_token_accuracy": 0.849581449329853, + "eval_num_tokens": 5758458.0, + "eval_runtime": 51.3251, + "eval_samples_per_second": 31.135, + "eval_steps_per_second": 3.897, + "step": 2244 + }, + { + "entropy": 0.24638524448329752, + "epoch": 6.016064257028113, + "grad_norm": 0.3538878262042999, + "learning_rate": 9.608243719413435e-05, + "loss": 0.19203664779663085, + "mean_token_accuracy": 0.9353304363862432, + "num_tokens": 5773027.0, + "step": 2250 + }, + { + "entropy": 0.17814311504364014, + "epoch": 6.149933065595716, + "grad_norm": 0.3886161148548126, + "learning_rate": 9.07380540227205e-05, + "loss": 0.12442682266235351, + "mean_token_accuracy": 0.9582101872563362, + "num_tokens": 5904840.0, + "step": 2300 + }, + { + "entropy": 0.17237136442214251, + "epoch": 6.28380187416332, + "grad_norm": 0.38807374238967896, + "learning_rate": 8.545068530927622e-05, + "loss": 0.12445520401000977, + "mean_token_accuracy": 0.9580146077275277, + "num_tokens": 6037457.0, + "step": 2350 + }, + { + "entropy": 0.18334724467247723, + "epoch": 6.417670682730924, + "grad_norm": 0.48334068059921265, + "learning_rate": 8.023184362449975e-05, + "loss": 0.12853397369384767, + "mean_token_accuracy": 0.956232733130455, + "num_tokens": 6161042.0, + "step": 2400 + }, + { + "entropy": 0.17894859783351422, + "epoch": 6.551539491298527, + "grad_norm": 0.3343403935432434, + "learning_rate": 7.509289233022861e-05, + "loss": 0.12605968475341797, + "mean_token_accuracy": 0.9566894540190697, + "num_tokens": 6291748.0, + "step": 2450 + }, + { + "entropy": 0.17900108266621828, + "epoch": 6.685408299866131, + "grad_norm": 0.41615480184555054, + "learning_rate": 7.00450208371691e-05, + "loss": 0.12843725204467774, + "mean_token_accuracy": 0.956638223528862, + "num_tokens": 6419265.0, + "step": 2500 + }, + { + "entropy": 0.18341445792466401, + "epoch": 6.8192771084337345, + "grad_norm": 0.3722545802593231, + "learning_rate": 6.509922024138231e-05, + "loss": 0.13251185417175293, + "mean_token_accuracy": 0.9549383011460304, + "num_tokens": 6544758.0, + "step": 2550 + }, + { + "entropy": 0.17789534136652946, + "epoch": 6.953145917001339, + "grad_norm": 0.33068087697029114, + "learning_rate": 6.02662593925748e-05, + "loss": 0.126302547454834, + "mean_token_accuracy": 0.9568320420384407, + "num_tokens": 6674626.0, + "step": 2600 + }, + { + "epoch": 7.0, + "eval_entropy": 0.2639452085644007, + "eval_loss": 0.7510635852813721, + "eval_mean_token_accuracy": 0.8456276795268058, + "eval_num_tokens": 6718201.0, + "eval_runtime": 51.3435, + "eval_samples_per_second": 31.124, + "eval_steps_per_second": 3.895, + "step": 2618 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.8789404553397146e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/README.md b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/adapter_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8818eeae60314bb5e6cd8bb9d31975488784721f --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.0094403300459725, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "q_proj", + "gate_proj", + "o_proj", + "v_proj", + "k_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/chat_template.jinja b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/tokenizer_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/trainer_state.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d0e6b6ba6e3b30989caa6b4be5d65f59c2c24c39 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/trainer_state.json @@ -0,0 +1,712 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.0, + "eval_steps": 500, + "global_step": 2992, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.2357245934009553, + "epoch": 0.13386880856760375, + "grad_norm": 1.1940524578094482, + "learning_rate": 3.0637587087867373e-05, + "loss": 2.0509376525878906, + "mean_token_accuracy": 0.6194104523956776, + "num_tokens": 127704.0, + "step": 50 + }, + { + "entropy": 0.8588631230592728, + "epoch": 0.2677376171352075, + "grad_norm": 0.6621095538139343, + "learning_rate": 6.190043105507899e-05, + "loss": 0.8043325805664062, + "mean_token_accuracy": 0.7855384379625321, + "num_tokens": 256077.0, + "step": 100 + }, + { + "entropy": 0.6726470375061036, + "epoch": 0.40160642570281124, + "grad_norm": 0.5255736112594604, + "learning_rate": 9.316327502229059e-05, + "loss": 0.6415129089355469, + "mean_token_accuracy": 0.8183896672725678, + "num_tokens": 387735.0, + "step": 150 + }, + { + "entropy": 0.6119109469652176, + "epoch": 0.535475234270415, + "grad_norm": 0.45975860953330994, + "learning_rate": 0.0001244261189895022, + "loss": 0.576114501953125, + "mean_token_accuracy": 0.8376823288202285, + "num_tokens": 522202.0, + "step": 200 + }, + { + "entropy": 0.5972725109755993, + "epoch": 0.6693440428380187, + "grad_norm": 0.40055611729621887, + "learning_rate": 0.0001556889629567138, + "loss": 0.5613529205322265, + "mean_token_accuracy": 0.8418879929184914, + "num_tokens": 648663.0, + "step": 250 + }, + { + "entropy": 0.5673307004570961, + "epoch": 0.8032128514056225, + "grad_norm": 0.7427454590797424, + "learning_rate": 0.0001869518069239254, + "loss": 0.5325925827026368, + "mean_token_accuracy": 0.8488189685344696, + "num_tokens": 778245.0, + "step": 300 + }, + { + "entropy": 0.5615521620213986, + "epoch": 0.9370816599732262, + "grad_norm": 0.4017025828361511, + "learning_rate": 0.000218214650891137, + "loss": 0.522266616821289, + "mean_token_accuracy": 0.8500416606664658, + "num_tokens": 905328.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5758187392354012, + "eval_loss": 0.5664522051811218, + "eval_mean_token_accuracy": 0.8383084440231323, + "eval_num_tokens": 959743.0, + "eval_runtime": 51.7964, + "eval_samples_per_second": 30.852, + "eval_steps_per_second": 3.861, + "step": 374 + }, + { + "entropy": 0.5440343178883947, + "epoch": 1.069611780455154, + "grad_norm": 0.3084093928337097, + "learning_rate": 0.00023381424541885068, + "loss": 0.5051319122314453, + "mean_token_accuracy": 0.8554374280602041, + "num_tokens": 1022581.0, + "step": 400 + }, + { + "entropy": 0.5245272906124592, + "epoch": 1.2034805890227578, + "grad_norm": 0.3342743515968323, + "learning_rate": 0.00023355972972676628, + "loss": 0.4855318450927734, + "mean_token_accuracy": 0.8584430786967278, + "num_tokens": 1151430.0, + "step": 450 + }, + { + "entropy": 0.5234513898193837, + "epoch": 1.3373493975903614, + "grad_norm": 0.26953577995300293, + "learning_rate": 0.00023305125251804043, + "loss": 0.4811768341064453, + "mean_token_accuracy": 0.8606387570500373, + "num_tokens": 1282481.0, + "step": 500 + }, + { + "entropy": 0.5147616830468178, + "epoch": 1.4712182061579653, + "grad_norm": 0.28791534900665283, + "learning_rate": 0.0002322899209369128, + "loss": 0.4762062835693359, + "mean_token_accuracy": 0.8612849581241607, + "num_tokens": 1414166.0, + "step": 550 + }, + { + "entropy": 0.5060296922922134, + "epoch": 1.605087014725569, + "grad_norm": 0.2496163249015808, + "learning_rate": 0.0002312773926857543, + "loss": 0.4695176315307617, + "mean_token_accuracy": 0.8636157616972924, + "num_tokens": 1547902.0, + "step": 600 + }, + { + "entropy": 0.5008939932286739, + "epoch": 1.7389558232931726, + "grad_norm": 0.2329542636871338, + "learning_rate": 0.00023001587241563198, + "loss": 0.46317913055419924, + "mean_token_accuracy": 0.8652430367469788, + "num_tokens": 1678635.0, + "step": 650 + }, + { + "entropy": 0.4986082436144352, + "epoch": 1.8728246318607764, + "grad_norm": 0.2459402084350586, + "learning_rate": 0.00022850810692596235, + "loss": 0.4617066192626953, + "mean_token_accuracy": 0.8672981086373329, + "num_tokens": 1803328.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.546445109397173, + "eval_loss": 0.5307288765907288, + "eval_mean_token_accuracy": 0.8434162598848343, + "eval_num_tokens": 1919486.0, + "eval_runtime": 51.3627, + "eval_samples_per_second": 31.112, + "eval_steps_per_second": 3.894, + "step": 748 + }, + { + "entropy": 0.5069879525237613, + "epoch": 2.005354752342704, + "grad_norm": 0.21533548831939697, + "learning_rate": 0.00022675737918370628, + "loss": 0.4585062026977539, + "mean_token_accuracy": 0.865652882390552, + "num_tokens": 1925307.0, + "step": 750 + }, + { + "entropy": 0.4457989126443863, + "epoch": 2.139223560910308, + "grad_norm": 0.24705350399017334, + "learning_rate": 0.00022476750117512737, + "loss": 0.4026710891723633, + "mean_token_accuracy": 0.8774338760972022, + "num_tokens": 2055627.0, + "step": 800 + }, + { + "entropy": 0.45084999009966853, + "epoch": 2.2730923694779115, + "grad_norm": 0.26643863320350647, + "learning_rate": 0.00022254280560567822, + "loss": 0.40950340270996094, + "mean_token_accuracy": 0.8752825647592545, + "num_tokens": 2177850.0, + "step": 850 + }, + { + "entropy": 0.4516983331739903, + "epoch": 2.4069611780455156, + "grad_norm": 0.26972198486328125, + "learning_rate": 0.00022008813646608725, + "loss": 0.4115512466430664, + "mean_token_accuracy": 0.8761435833573341, + "num_tokens": 2304612.0, + "step": 900 + }, + { + "entropy": 0.45280722543597224, + "epoch": 2.540829986613119, + "grad_norm": 0.2643600106239319, + "learning_rate": 0.00021740883848518684, + "loss": 0.41181053161621095, + "mean_token_accuracy": 0.8756420350074768, + "num_tokens": 2430946.0, + "step": 950 + }, + { + "entropy": 0.4487862553447485, + "epoch": 2.674698795180723, + "grad_norm": 0.2849285304546356, + "learning_rate": 0.00021451074549244846, + "loss": 0.4094270706176758, + "mean_token_accuracy": 0.8771369129419326, + "num_tokens": 2557241.0, + "step": 1000 + }, + { + "entropy": 0.45000545382499696, + "epoch": 2.8085676037483265, + "grad_norm": 0.24081671237945557, + "learning_rate": 0.0002114001677155633, + "loss": 0.4073855972290039, + "mean_token_accuracy": 0.8779223081469536, + "num_tokens": 2692775.0, + "step": 1050 + }, + { + "entropy": 0.4576124830543995, + "epoch": 2.9424364123159306, + "grad_norm": 0.2341010719537735, + "learning_rate": 0.00020808387804072673, + "loss": 0.4154107666015625, + "mean_token_accuracy": 0.8756425747275353, + "num_tokens": 2823060.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.4812743577361107, + "eval_loss": 0.523208498954773, + "eval_mean_token_accuracy": 0.8494922530651092, + "eval_num_tokens": 2879229.0, + "eval_runtime": 51.3707, + "eval_samples_per_second": 31.107, + "eval_steps_per_second": 3.893, + "step": 1122 + }, + { + "entropy": 0.41342759042075183, + "epoch": 3.074966532797858, + "grad_norm": 0.34829744696617126, + "learning_rate": 0.0002045690972655427, + "loss": 0.3644887542724609, + "mean_token_accuracy": 0.8879408977850519, + "num_tokens": 2955424.0, + "step": 1150 + }, + { + "entropy": 0.3948360003530979, + "epoch": 3.208835341365462, + "grad_norm": 0.312427282333374, + "learning_rate": 0.00020086347837665854, + "loss": 0.34146129608154296, + "mean_token_accuracy": 0.8914449456334114, + "num_tokens": 3078799.0, + "step": 1200 + }, + { + "entropy": 0.38602679744362833, + "epoch": 3.3427041499330654, + "grad_norm": 0.2715625762939453, + "learning_rate": 0.0001969750898863629, + "loss": 0.34105979919433593, + "mean_token_accuracy": 0.8928995525836945, + "num_tokens": 3211945.0, + "step": 1250 + }, + { + "entropy": 0.3923524462431669, + "epoch": 3.4765729585006695, + "grad_norm": 0.27295640110969543, + "learning_rate": 0.00019291239826442992, + "loss": 0.3473458099365234, + "mean_token_accuracy": 0.8913829082250595, + "num_tokens": 3343933.0, + "step": 1300 + }, + { + "entropy": 0.40172914519906044, + "epoch": 3.610441767068273, + "grad_norm": 0.32684192061424255, + "learning_rate": 0.0001886842495034615, + "loss": 0.35543827056884764, + "mean_token_accuracy": 0.8904094022512435, + "num_tokens": 3470087.0, + "step": 1350 + }, + { + "entropy": 0.4010512103140354, + "epoch": 3.7443105756358768, + "grad_norm": 0.23922984302043915, + "learning_rate": 0.00018429984985786734, + "loss": 0.3535212326049805, + "mean_token_accuracy": 0.8891122484207153, + "num_tokens": 3590858.0, + "step": 1400 + }, + { + "entropy": 0.39226082623004915, + "epoch": 3.878179384203481, + "grad_norm": 0.25130993127822876, + "learning_rate": 0.00017976874579842046, + "loss": 0.3484851837158203, + "mean_token_accuracy": 0.8916165816783905, + "num_tokens": 3725806.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.45332007586956025, + "eval_loss": 0.5243425965309143, + "eval_mean_token_accuracy": 0.8532214590907097, + "eval_num_tokens": 3838972.0, + "eval_runtime": 51.3837, + "eval_samples_per_second": 31.099, + "eval_steps_per_second": 3.892, + "step": 1496 + }, + { + "entropy": 0.39358587043754983, + "epoch": 4.010709504685408, + "grad_norm": 0.27960067987442017, + "learning_rate": 0.0001751008032260355, + "loss": 0.34616813659667967, + "mean_token_accuracy": 0.8923709010234987, + "num_tokens": 3849380.0, + "step": 1500 + }, + { + "entropy": 0.3206369188427925, + "epoch": 4.144578313253012, + "grad_norm": 0.37261858582496643, + "learning_rate": 0.00017030618599002818, + "loss": 0.2684581565856934, + "mean_token_accuracy": 0.9131761506199837, + "num_tokens": 3976694.0, + "step": 1550 + }, + { + "entropy": 0.3254615054279566, + "epoch": 4.278447121820616, + "grad_norm": 0.39803799986839294, + "learning_rate": 0.00016539533375763032, + "loss": 0.2769618606567383, + "mean_token_accuracy": 0.9102409112453461, + "num_tokens": 4110204.0, + "step": 1600 + }, + { + "entropy": 0.32003962114453316, + "epoch": 4.412315930388219, + "grad_norm": 0.35707366466522217, + "learning_rate": 0.0001603789392829468, + "loss": 0.2749842834472656, + "mean_token_accuracy": 0.910626070201397, + "num_tokens": 4240883.0, + "step": 1650 + }, + { + "entropy": 0.32672152675688265, + "epoch": 4.546184738955823, + "grad_norm": 0.47052621841430664, + "learning_rate": 0.00015526792512484774, + "loss": 0.27983531951904295, + "mean_token_accuracy": 0.9093958771228791, + "num_tokens": 4365381.0, + "step": 1700 + }, + { + "entropy": 0.33449163861572745, + "epoch": 4.680053547523427, + "grad_norm": 0.330709844827652, + "learning_rate": 0.00015007341986449012, + "loss": 0.28533639907836916, + "mean_token_accuracy": 0.9082232251763344, + "num_tokens": 4490711.0, + "step": 1750 + }, + { + "entropy": 0.33353066638112067, + "epoch": 4.813922356091031, + "grad_norm": 0.3990134298801422, + "learning_rate": 0.00014480673387425272, + "loss": 0.28489078521728517, + "mean_token_accuracy": 0.908001911342144, + "num_tokens": 4618532.0, + "step": 1800 + }, + { + "entropy": 0.3272412090748549, + "epoch": 4.947791164658635, + "grad_norm": 0.3183020353317261, + "learning_rate": 0.00013947933469084315, + "loss": 0.2772365379333496, + "mean_token_accuracy": 0.908946952521801, + "num_tokens": 4752261.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.3932068006694317, + "eval_loss": 0.5647156834602356, + "eval_mean_token_accuracy": 0.85078628718853, + "eval_num_tokens": 4798715.0, + "eval_runtime": 51.3578, + "eval_samples_per_second": 31.115, + "eval_steps_per_second": 3.894, + "step": 1870 + }, + { + "entropy": 0.2832252390005372, + "epoch": 5.080321285140562, + "grad_norm": 0.3697633147239685, + "learning_rate": 0.00013410282204620014, + "loss": 0.2279021453857422, + "mean_token_accuracy": 0.9252248072262966, + "num_tokens": 4879271.0, + "step": 1900 + }, + { + "entropy": 0.250804705247283, + "epoch": 5.214190093708166, + "grad_norm": 0.3890162706375122, + "learning_rate": 0.00012868890261055722, + "loss": 0.1980854606628418, + "mean_token_accuracy": 0.9338876655697823, + "num_tokens": 5005076.0, + "step": 1950 + }, + { + "entropy": 0.2531572911888361, + "epoch": 5.34805890227577, + "grad_norm": 0.43466225266456604, + "learning_rate": 0.0001232493645026623, + "loss": 0.20114482879638673, + "mean_token_accuracy": 0.9317018255591393, + "num_tokens": 5133591.0, + "step": 2000 + }, + { + "entropy": 0.25918263107538225, + "epoch": 5.481927710843373, + "grad_norm": 0.38253673911094666, + "learning_rate": 0.00011779605162265297, + "loss": 0.2056061363220215, + "mean_token_accuracy": 0.9302830925583839, + "num_tokens": 5257252.0, + "step": 2050 + }, + { + "entropy": 0.2553627458959818, + "epoch": 5.615796519410977, + "grad_norm": 0.4536231458187103, + "learning_rate": 0.00011234083786347563, + "loss": 0.20531394958496094, + "mean_token_accuracy": 0.9302299374341965, + "num_tokens": 5388652.0, + "step": 2100 + }, + { + "entropy": 0.2575570110231638, + "epoch": 5.749665327978581, + "grad_norm": 0.36399731040000916, + "learning_rate": 0.00010689560125699833, + "loss": 0.2048162841796875, + "mean_token_accuracy": 0.9306997761130333, + "num_tokens": 5515488.0, + "step": 2150 + }, + { + "entropy": 0.24660897620022296, + "epoch": 5.883534136546185, + "grad_norm": 0.43602702021598816, + "learning_rate": 0.00010147219811111233, + "loss": 0.1986431884765625, + "mean_token_accuracy": 0.9335029146075249, + "num_tokens": 5644323.0, + "step": 2200 + }, + { + "epoch": 6.0, + "eval_entropy": 0.32280903935432437, + "eval_loss": 0.6383674144744873, + "eval_mean_token_accuracy": 0.849581449329853, + "eval_num_tokens": 5758458.0, + "eval_runtime": 51.3251, + "eval_samples_per_second": 31.135, + "eval_steps_per_second": 3.897, + "step": 2244 + }, + { + "entropy": 0.24638524448329752, + "epoch": 6.016064257028113, + "grad_norm": 0.3538878262042999, + "learning_rate": 9.608243719413435e-05, + "loss": 0.19203664779663085, + "mean_token_accuracy": 0.9353304363862432, + "num_tokens": 5773027.0, + "step": 2250 + }, + { + "entropy": 0.17814311504364014, + "epoch": 6.149933065595716, + "grad_norm": 0.3886161148548126, + "learning_rate": 9.07380540227205e-05, + "loss": 0.12442682266235351, + "mean_token_accuracy": 0.9582101872563362, + "num_tokens": 5904840.0, + "step": 2300 + }, + { + "entropy": 0.17237136442214251, + "epoch": 6.28380187416332, + "grad_norm": 0.38807374238967896, + "learning_rate": 8.545068530927622e-05, + "loss": 0.12445520401000977, + "mean_token_accuracy": 0.9580146077275277, + "num_tokens": 6037457.0, + "step": 2350 + }, + { + "entropy": 0.18334724467247723, + "epoch": 6.417670682730924, + "grad_norm": 0.48334068059921265, + "learning_rate": 8.023184362449975e-05, + "loss": 0.12853397369384767, + "mean_token_accuracy": 0.956232733130455, + "num_tokens": 6161042.0, + "step": 2400 + }, + { + "entropy": 0.17894859783351422, + "epoch": 6.551539491298527, + "grad_norm": 0.3343403935432434, + "learning_rate": 7.509289233022861e-05, + "loss": 0.12605968475341797, + "mean_token_accuracy": 0.9566894540190697, + "num_tokens": 6291748.0, + "step": 2450 + }, + { + "entropy": 0.17900108266621828, + "epoch": 6.685408299866131, + "grad_norm": 0.41615480184555054, + "learning_rate": 7.00450208371691e-05, + "loss": 0.12843725204467774, + "mean_token_accuracy": 0.956638223528862, + "num_tokens": 6419265.0, + "step": 2500 + }, + { + "entropy": 0.18341445792466401, + "epoch": 6.8192771084337345, + "grad_norm": 0.3722545802593231, + "learning_rate": 6.509922024138231e-05, + "loss": 0.13251185417175293, + "mean_token_accuracy": 0.9549383011460304, + "num_tokens": 6544758.0, + "step": 2550 + }, + { + "entropy": 0.17789534136652946, + "epoch": 6.953145917001339, + "grad_norm": 0.33068087697029114, + "learning_rate": 6.02662593925748e-05, + "loss": 0.126302547454834, + "mean_token_accuracy": 0.9568320420384407, + "num_tokens": 6674626.0, + "step": 2600 + }, + { + "epoch": 7.0, + "eval_entropy": 0.2639452085644007, + "eval_loss": 0.7510635852813721, + "eval_mean_token_accuracy": 0.8456276795268058, + "eval_num_tokens": 6718201.0, + "eval_runtime": 51.3435, + "eval_samples_per_second": 31.124, + "eval_steps_per_second": 3.895, + "step": 2618 + }, + { + "entropy": 0.15149398003187445, + "epoch": 7.085676037483267, + "grad_norm": 0.3362366855144501, + "learning_rate": 5.5556661446302733e-05, + "loss": 0.09618576049804688, + "mean_token_accuracy": 0.9674260706612559, + "num_tokens": 6803830.0, + "step": 2650 + }, + { + "entropy": 0.1279136904887855, + "epoch": 7.21954484605087, + "grad_norm": 0.2925446033477783, + "learning_rate": 5.0980680951143166e-05, + "loss": 0.07902004718780517, + "mean_token_accuracy": 0.9733691918849945, + "num_tokens": 6936289.0, + "step": 2700 + }, + { + "entropy": 0.13701909594237804, + "epoch": 7.353413654618474, + "grad_norm": 0.2659797668457031, + "learning_rate": 4.6548281520723104e-05, + "loss": 0.08250561714172364, + "mean_token_accuracy": 0.971816695034504, + "num_tokens": 7057823.0, + "step": 2750 + }, + { + "entropy": 0.12771729078143834, + "epoch": 7.4872824631860775, + "grad_norm": 0.34247028827667236, + "learning_rate": 4.2269114139222296e-05, + "loss": 0.08109721183776855, + "mean_token_accuracy": 0.9735026282072067, + "num_tokens": 7187020.0, + "step": 2800 + }, + { + "entropy": 0.12350119687616826, + "epoch": 7.621151271753681, + "grad_norm": 0.40673893690109253, + "learning_rate": 3.8152496147586614e-05, + "loss": 0.07707037448883057, + "mean_token_accuracy": 0.9743763041496277, + "num_tokens": 7323457.0, + "step": 2850 + }, + { + "entropy": 0.12642662361264229, + "epoch": 7.755020080321285, + "grad_norm": 0.3064998984336853, + "learning_rate": 3.4207390956206875e-05, + "loss": 0.07985133647918702, + "mean_token_accuracy": 0.9733496251702308, + "num_tokens": 7451765.0, + "step": 2900 + }, + { + "entropy": 0.1269074559956789, + "epoch": 7.888888888888889, + "grad_norm": 0.24833732843399048, + "learning_rate": 3.0442388528236647e-05, + "loss": 0.0821513843536377, + "mean_token_accuracy": 0.9733479696512223, + "num_tokens": 7578222.0, + "step": 2950 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2316589906811714, + "eval_loss": 0.851987898349762, + "eval_mean_token_accuracy": 0.8467304027080536, + "eval_num_tokens": 7677944.0, + "eval_runtime": 51.3501, + "eval_samples_per_second": 31.12, + "eval_steps_per_second": 3.895, + "step": 2992 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.2928370910154854e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/README.md b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/adapter_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8818eeae60314bb5e6cd8bb9d31975488784721f --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.0094403300459725, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "q_proj", + "gate_proj", + "o_proj", + "v_proj", + "k_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/chat_template.jinja b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/tokenizer_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/trainer_state.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..54835173496a2e4cee9b6d7268fd47ce2694cd87 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/trainer_state.json @@ -0,0 +1,803 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.0, + "eval_steps": 500, + "global_step": 3366, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.2357245934009553, + "epoch": 0.13386880856760375, + "grad_norm": 1.1940524578094482, + "learning_rate": 3.0637587087867373e-05, + "loss": 2.0509376525878906, + "mean_token_accuracy": 0.6194104523956776, + "num_tokens": 127704.0, + "step": 50 + }, + { + "entropy": 0.8588631230592728, + "epoch": 0.2677376171352075, + "grad_norm": 0.6621095538139343, + "learning_rate": 6.190043105507899e-05, + "loss": 0.8043325805664062, + "mean_token_accuracy": 0.7855384379625321, + "num_tokens": 256077.0, + "step": 100 + }, + { + "entropy": 0.6726470375061036, + "epoch": 0.40160642570281124, + "grad_norm": 0.5255736112594604, + "learning_rate": 9.316327502229059e-05, + "loss": 0.6415129089355469, + "mean_token_accuracy": 0.8183896672725678, + "num_tokens": 387735.0, + "step": 150 + }, + { + "entropy": 0.6119109469652176, + "epoch": 0.535475234270415, + "grad_norm": 0.45975860953330994, + "learning_rate": 0.0001244261189895022, + "loss": 0.576114501953125, + "mean_token_accuracy": 0.8376823288202285, + "num_tokens": 522202.0, + "step": 200 + }, + { + "entropy": 0.5972725109755993, + "epoch": 0.6693440428380187, + "grad_norm": 0.40055611729621887, + "learning_rate": 0.0001556889629567138, + "loss": 0.5613529205322265, + "mean_token_accuracy": 0.8418879929184914, + "num_tokens": 648663.0, + "step": 250 + }, + { + "entropy": 0.5673307004570961, + "epoch": 0.8032128514056225, + "grad_norm": 0.7427454590797424, + "learning_rate": 0.0001869518069239254, + "loss": 0.5325925827026368, + "mean_token_accuracy": 0.8488189685344696, + "num_tokens": 778245.0, + "step": 300 + }, + { + "entropy": 0.5615521620213986, + "epoch": 0.9370816599732262, + "grad_norm": 0.4017025828361511, + "learning_rate": 0.000218214650891137, + "loss": 0.522266616821289, + "mean_token_accuracy": 0.8500416606664658, + "num_tokens": 905328.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5758187392354012, + "eval_loss": 0.5664522051811218, + "eval_mean_token_accuracy": 0.8383084440231323, + "eval_num_tokens": 959743.0, + "eval_runtime": 51.7964, + "eval_samples_per_second": 30.852, + "eval_steps_per_second": 3.861, + "step": 374 + }, + { + "entropy": 0.5440343178883947, + "epoch": 1.069611780455154, + "grad_norm": 0.3084093928337097, + "learning_rate": 0.00023381424541885068, + "loss": 0.5051319122314453, + "mean_token_accuracy": 0.8554374280602041, + "num_tokens": 1022581.0, + "step": 400 + }, + { + "entropy": 0.5245272906124592, + "epoch": 1.2034805890227578, + "grad_norm": 0.3342743515968323, + "learning_rate": 0.00023355972972676628, + "loss": 0.4855318450927734, + "mean_token_accuracy": 0.8584430786967278, + "num_tokens": 1151430.0, + "step": 450 + }, + { + "entropy": 0.5234513898193837, + "epoch": 1.3373493975903614, + "grad_norm": 0.26953577995300293, + "learning_rate": 0.00023305125251804043, + "loss": 0.4811768341064453, + "mean_token_accuracy": 0.8606387570500373, + "num_tokens": 1282481.0, + "step": 500 + }, + { + "entropy": 0.5147616830468178, + "epoch": 1.4712182061579653, + "grad_norm": 0.28791534900665283, + "learning_rate": 0.0002322899209369128, + "loss": 0.4762062835693359, + "mean_token_accuracy": 0.8612849581241607, + "num_tokens": 1414166.0, + "step": 550 + }, + { + "entropy": 0.5060296922922134, + "epoch": 1.605087014725569, + "grad_norm": 0.2496163249015808, + "learning_rate": 0.0002312773926857543, + "loss": 0.4695176315307617, + "mean_token_accuracy": 0.8636157616972924, + "num_tokens": 1547902.0, + "step": 600 + }, + { + "entropy": 0.5008939932286739, + "epoch": 1.7389558232931726, + "grad_norm": 0.2329542636871338, + "learning_rate": 0.00023001587241563198, + "loss": 0.46317913055419924, + "mean_token_accuracy": 0.8652430367469788, + "num_tokens": 1678635.0, + "step": 650 + }, + { + "entropy": 0.4986082436144352, + "epoch": 1.8728246318607764, + "grad_norm": 0.2459402084350586, + "learning_rate": 0.00022850810692596235, + "loss": 0.4617066192626953, + "mean_token_accuracy": 0.8672981086373329, + "num_tokens": 1803328.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.546445109397173, + "eval_loss": 0.5307288765907288, + "eval_mean_token_accuracy": 0.8434162598848343, + "eval_num_tokens": 1919486.0, + "eval_runtime": 51.3627, + "eval_samples_per_second": 31.112, + "eval_steps_per_second": 3.894, + "step": 748 + }, + { + "entropy": 0.5069879525237613, + "epoch": 2.005354752342704, + "grad_norm": 0.21533548831939697, + "learning_rate": 0.00022675737918370628, + "loss": 0.4585062026977539, + "mean_token_accuracy": 0.865652882390552, + "num_tokens": 1925307.0, + "step": 750 + }, + { + "entropy": 0.4457989126443863, + "epoch": 2.139223560910308, + "grad_norm": 0.24705350399017334, + "learning_rate": 0.00022476750117512737, + "loss": 0.4026710891723633, + "mean_token_accuracy": 0.8774338760972022, + "num_tokens": 2055627.0, + "step": 800 + }, + { + "entropy": 0.45084999009966853, + "epoch": 2.2730923694779115, + "grad_norm": 0.26643863320350647, + "learning_rate": 0.00022254280560567822, + "loss": 0.40950340270996094, + "mean_token_accuracy": 0.8752825647592545, + "num_tokens": 2177850.0, + "step": 850 + }, + { + "entropy": 0.4516983331739903, + "epoch": 2.4069611780455156, + "grad_norm": 0.26972198486328125, + "learning_rate": 0.00022008813646608725, + "loss": 0.4115512466430664, + "mean_token_accuracy": 0.8761435833573341, + "num_tokens": 2304612.0, + "step": 900 + }, + { + "entropy": 0.45280722543597224, + "epoch": 2.540829986613119, + "grad_norm": 0.2643600106239319, + "learning_rate": 0.00021740883848518684, + "loss": 0.41181053161621095, + "mean_token_accuracy": 0.8756420350074768, + "num_tokens": 2430946.0, + "step": 950 + }, + { + "entropy": 0.4487862553447485, + "epoch": 2.674698795180723, + "grad_norm": 0.2849285304546356, + "learning_rate": 0.00021451074549244846, + "loss": 0.4094270706176758, + "mean_token_accuracy": 0.8771369129419326, + "num_tokens": 2557241.0, + "step": 1000 + }, + { + "entropy": 0.45000545382499696, + "epoch": 2.8085676037483265, + "grad_norm": 0.24081671237945557, + "learning_rate": 0.0002114001677155633, + "loss": 0.4073855972290039, + "mean_token_accuracy": 0.8779223081469536, + "num_tokens": 2692775.0, + "step": 1050 + }, + { + "entropy": 0.4576124830543995, + "epoch": 2.9424364123159306, + "grad_norm": 0.2341010719537735, + "learning_rate": 0.00020808387804072673, + "loss": 0.4154107666015625, + "mean_token_accuracy": 0.8756425747275353, + "num_tokens": 2823060.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.4812743577361107, + "eval_loss": 0.523208498954773, + "eval_mean_token_accuracy": 0.8494922530651092, + "eval_num_tokens": 2879229.0, + "eval_runtime": 51.3707, + "eval_samples_per_second": 31.107, + "eval_steps_per_second": 3.893, + "step": 1122 + }, + { + "entropy": 0.41342759042075183, + "epoch": 3.074966532797858, + "grad_norm": 0.34829744696617126, + "learning_rate": 0.0002045690972655427, + "loss": 0.3644887542724609, + "mean_token_accuracy": 0.8879408977850519, + "num_tokens": 2955424.0, + "step": 1150 + }, + { + "entropy": 0.3948360003530979, + "epoch": 3.208835341365462, + "grad_norm": 0.312427282333374, + "learning_rate": 0.00020086347837665854, + "loss": 0.34146129608154296, + "mean_token_accuracy": 0.8914449456334114, + "num_tokens": 3078799.0, + "step": 1200 + }, + { + "entropy": 0.38602679744362833, + "epoch": 3.3427041499330654, + "grad_norm": 0.2715625762939453, + "learning_rate": 0.0001969750898863629, + "loss": 0.34105979919433593, + "mean_token_accuracy": 0.8928995525836945, + "num_tokens": 3211945.0, + "step": 1250 + }, + { + "entropy": 0.3923524462431669, + "epoch": 3.4765729585006695, + "grad_norm": 0.27295640110969543, + "learning_rate": 0.00019291239826442992, + "loss": 0.3473458099365234, + "mean_token_accuracy": 0.8913829082250595, + "num_tokens": 3343933.0, + "step": 1300 + }, + { + "entropy": 0.40172914519906044, + "epoch": 3.610441767068273, + "grad_norm": 0.32684192061424255, + "learning_rate": 0.0001886842495034615, + "loss": 0.35543827056884764, + "mean_token_accuracy": 0.8904094022512435, + "num_tokens": 3470087.0, + "step": 1350 + }, + { + "entropy": 0.4010512103140354, + "epoch": 3.7443105756358768, + "grad_norm": 0.23922984302043915, + "learning_rate": 0.00018429984985786734, + "loss": 0.3535212326049805, + "mean_token_accuracy": 0.8891122484207153, + "num_tokens": 3590858.0, + "step": 1400 + }, + { + "entropy": 0.39226082623004915, + "epoch": 3.878179384203481, + "grad_norm": 0.25130993127822876, + "learning_rate": 0.00017976874579842046, + "loss": 0.3484851837158203, + "mean_token_accuracy": 0.8916165816783905, + "num_tokens": 3725806.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.45332007586956025, + "eval_loss": 0.5243425965309143, + "eval_mean_token_accuracy": 0.8532214590907097, + "eval_num_tokens": 3838972.0, + "eval_runtime": 51.3837, + "eval_samples_per_second": 31.099, + "eval_steps_per_second": 3.892, + "step": 1496 + }, + { + "entropy": 0.39358587043754983, + "epoch": 4.010709504685408, + "grad_norm": 0.27960067987442017, + "learning_rate": 0.0001751008032260355, + "loss": 0.34616813659667967, + "mean_token_accuracy": 0.8923709010234987, + "num_tokens": 3849380.0, + "step": 1500 + }, + { + "entropy": 0.3206369188427925, + "epoch": 4.144578313253012, + "grad_norm": 0.37261858582496643, + "learning_rate": 0.00017030618599002818, + "loss": 0.2684581565856934, + "mean_token_accuracy": 0.9131761506199837, + "num_tokens": 3976694.0, + "step": 1550 + }, + { + "entropy": 0.3254615054279566, + "epoch": 4.278447121820616, + "grad_norm": 0.39803799986839294, + "learning_rate": 0.00016539533375763032, + "loss": 0.2769618606567383, + "mean_token_accuracy": 0.9102409112453461, + "num_tokens": 4110204.0, + "step": 1600 + }, + { + "entropy": 0.32003962114453316, + "epoch": 4.412315930388219, + "grad_norm": 0.35707366466522217, + "learning_rate": 0.0001603789392829468, + "loss": 0.2749842834472656, + "mean_token_accuracy": 0.910626070201397, + "num_tokens": 4240883.0, + "step": 1650 + }, + { + "entropy": 0.32672152675688265, + "epoch": 4.546184738955823, + "grad_norm": 0.47052621841430664, + "learning_rate": 0.00015526792512484774, + "loss": 0.27983531951904295, + "mean_token_accuracy": 0.9093958771228791, + "num_tokens": 4365381.0, + "step": 1700 + }, + { + "entropy": 0.33449163861572745, + "epoch": 4.680053547523427, + "grad_norm": 0.330709844827652, + "learning_rate": 0.00015007341986449012, + "loss": 0.28533639907836916, + "mean_token_accuracy": 0.9082232251763344, + "num_tokens": 4490711.0, + "step": 1750 + }, + { + "entropy": 0.33353066638112067, + "epoch": 4.813922356091031, + "grad_norm": 0.3990134298801422, + "learning_rate": 0.00014480673387425272, + "loss": 0.28489078521728517, + "mean_token_accuracy": 0.908001911342144, + "num_tokens": 4618532.0, + "step": 1800 + }, + { + "entropy": 0.3272412090748549, + "epoch": 4.947791164658635, + "grad_norm": 0.3183020353317261, + "learning_rate": 0.00013947933469084315, + "loss": 0.2772365379333496, + "mean_token_accuracy": 0.908946952521801, + "num_tokens": 4752261.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.3932068006694317, + "eval_loss": 0.5647156834602356, + "eval_mean_token_accuracy": 0.85078628718853, + "eval_num_tokens": 4798715.0, + "eval_runtime": 51.3578, + "eval_samples_per_second": 31.115, + "eval_steps_per_second": 3.894, + "step": 1870 + }, + { + "entropy": 0.2832252390005372, + "epoch": 5.080321285140562, + "grad_norm": 0.3697633147239685, + "learning_rate": 0.00013410282204620014, + "loss": 0.2279021453857422, + "mean_token_accuracy": 0.9252248072262966, + "num_tokens": 4879271.0, + "step": 1900 + }, + { + "entropy": 0.250804705247283, + "epoch": 5.214190093708166, + "grad_norm": 0.3890162706375122, + "learning_rate": 0.00012868890261055722, + "loss": 0.1980854606628418, + "mean_token_accuracy": 0.9338876655697823, + "num_tokens": 5005076.0, + "step": 1950 + }, + { + "entropy": 0.2531572911888361, + "epoch": 5.34805890227577, + "grad_norm": 0.43466225266456604, + "learning_rate": 0.0001232493645026623, + "loss": 0.20114482879638673, + "mean_token_accuracy": 0.9317018255591393, + "num_tokens": 5133591.0, + "step": 2000 + }, + { + "entropy": 0.25918263107538225, + "epoch": 5.481927710843373, + "grad_norm": 0.38253673911094666, + "learning_rate": 0.00011779605162265297, + "loss": 0.2056061363220215, + "mean_token_accuracy": 0.9302830925583839, + "num_tokens": 5257252.0, + "step": 2050 + }, + { + "entropy": 0.2553627458959818, + "epoch": 5.615796519410977, + "grad_norm": 0.4536231458187103, + "learning_rate": 0.00011234083786347563, + "loss": 0.20531394958496094, + "mean_token_accuracy": 0.9302299374341965, + "num_tokens": 5388652.0, + "step": 2100 + }, + { + "entropy": 0.2575570110231638, + "epoch": 5.749665327978581, + "grad_norm": 0.36399731040000916, + "learning_rate": 0.00010689560125699833, + "loss": 0.2048162841796875, + "mean_token_accuracy": 0.9306997761130333, + "num_tokens": 5515488.0, + "step": 2150 + }, + { + "entropy": 0.24660897620022296, + "epoch": 5.883534136546185, + "grad_norm": 0.43602702021598816, + "learning_rate": 0.00010147219811111233, + "loss": 0.1986431884765625, + "mean_token_accuracy": 0.9335029146075249, + "num_tokens": 5644323.0, + "step": 2200 + }, + { + "epoch": 6.0, + "eval_entropy": 0.32280903935432437, + "eval_loss": 0.6383674144744873, + "eval_mean_token_accuracy": 0.849581449329853, + "eval_num_tokens": 5758458.0, + "eval_runtime": 51.3251, + "eval_samples_per_second": 31.135, + "eval_steps_per_second": 3.897, + "step": 2244 + }, + { + "entropy": 0.24638524448329752, + "epoch": 6.016064257028113, + "grad_norm": 0.3538878262042999, + "learning_rate": 9.608243719413435e-05, + "loss": 0.19203664779663085, + "mean_token_accuracy": 0.9353304363862432, + "num_tokens": 5773027.0, + "step": 2250 + }, + { + "entropy": 0.17814311504364014, + "epoch": 6.149933065595716, + "grad_norm": 0.3886161148548126, + "learning_rate": 9.07380540227205e-05, + "loss": 0.12442682266235351, + "mean_token_accuracy": 0.9582101872563362, + "num_tokens": 5904840.0, + "step": 2300 + }, + { + "entropy": 0.17237136442214251, + "epoch": 6.28380187416332, + "grad_norm": 0.38807374238967896, + "learning_rate": 8.545068530927622e-05, + "loss": 0.12445520401000977, + "mean_token_accuracy": 0.9580146077275277, + "num_tokens": 6037457.0, + "step": 2350 + }, + { + "entropy": 0.18334724467247723, + "epoch": 6.417670682730924, + "grad_norm": 0.48334068059921265, + "learning_rate": 8.023184362449975e-05, + "loss": 0.12853397369384767, + "mean_token_accuracy": 0.956232733130455, + "num_tokens": 6161042.0, + "step": 2400 + }, + { + "entropy": 0.17894859783351422, + "epoch": 6.551539491298527, + "grad_norm": 0.3343403935432434, + "learning_rate": 7.509289233022861e-05, + "loss": 0.12605968475341797, + "mean_token_accuracy": 0.9566894540190697, + "num_tokens": 6291748.0, + "step": 2450 + }, + { + "entropy": 0.17900108266621828, + "epoch": 6.685408299866131, + "grad_norm": 0.41615480184555054, + "learning_rate": 7.00450208371691e-05, + "loss": 0.12843725204467774, + "mean_token_accuracy": 0.956638223528862, + "num_tokens": 6419265.0, + "step": 2500 + }, + { + "entropy": 0.18341445792466401, + "epoch": 6.8192771084337345, + "grad_norm": 0.3722545802593231, + "learning_rate": 6.509922024138231e-05, + "loss": 0.13251185417175293, + "mean_token_accuracy": 0.9549383011460304, + "num_tokens": 6544758.0, + "step": 2550 + }, + { + "entropy": 0.17789534136652946, + "epoch": 6.953145917001339, + "grad_norm": 0.33068087697029114, + "learning_rate": 6.02662593925748e-05, + "loss": 0.126302547454834, + "mean_token_accuracy": 0.9568320420384407, + "num_tokens": 6674626.0, + "step": 2600 + }, + { + "epoch": 7.0, + "eval_entropy": 0.2639452085644007, + "eval_loss": 0.7510635852813721, + "eval_mean_token_accuracy": 0.8456276795268058, + "eval_num_tokens": 6718201.0, + "eval_runtime": 51.3435, + "eval_samples_per_second": 31.124, + "eval_steps_per_second": 3.895, + "step": 2618 + }, + { + "entropy": 0.15149398003187445, + "epoch": 7.085676037483267, + "grad_norm": 0.3362366855144501, + "learning_rate": 5.5556661446302733e-05, + "loss": 0.09618576049804688, + "mean_token_accuracy": 0.9674260706612559, + "num_tokens": 6803830.0, + "step": 2650 + }, + { + "entropy": 0.1279136904887855, + "epoch": 7.21954484605087, + "grad_norm": 0.2925446033477783, + "learning_rate": 5.0980680951143166e-05, + "loss": 0.07902004718780517, + "mean_token_accuracy": 0.9733691918849945, + "num_tokens": 6936289.0, + "step": 2700 + }, + { + "entropy": 0.13701909594237804, + "epoch": 7.353413654618474, + "grad_norm": 0.2659797668457031, + "learning_rate": 4.6548281520723104e-05, + "loss": 0.08250561714172364, + "mean_token_accuracy": 0.971816695034504, + "num_tokens": 7057823.0, + "step": 2750 + }, + { + "entropy": 0.12771729078143834, + "epoch": 7.4872824631860775, + "grad_norm": 0.34247028827667236, + "learning_rate": 4.2269114139222296e-05, + "loss": 0.08109721183776855, + "mean_token_accuracy": 0.9735026282072067, + "num_tokens": 7187020.0, + "step": 2800 + }, + { + "entropy": 0.12350119687616826, + "epoch": 7.621151271753681, + "grad_norm": 0.40673893690109253, + "learning_rate": 3.8152496147586614e-05, + "loss": 0.07707037448883057, + "mean_token_accuracy": 0.9743763041496277, + "num_tokens": 7323457.0, + "step": 2850 + }, + { + "entropy": 0.12642662361264229, + "epoch": 7.755020080321285, + "grad_norm": 0.3064998984336853, + "learning_rate": 3.4207390956206875e-05, + "loss": 0.07985133647918702, + "mean_token_accuracy": 0.9733496251702308, + "num_tokens": 7451765.0, + "step": 2900 + }, + { + "entropy": 0.1269074559956789, + "epoch": 7.888888888888889, + "grad_norm": 0.24833732843399048, + "learning_rate": 3.0442388528236647e-05, + "loss": 0.0821513843536377, + "mean_token_accuracy": 0.9733479696512223, + "num_tokens": 7578222.0, + "step": 2950 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2316589906811714, + "eval_loss": 0.851987898349762, + "eval_mean_token_accuracy": 0.8467304027080536, + "eval_num_tokens": 7677944.0, + "eval_runtime": 51.3501, + "eval_samples_per_second": 31.12, + "eval_steps_per_second": 3.895, + "step": 2992 + }, + { + "entropy": 0.13051860011888272, + "epoch": 8.021419009370817, + "grad_norm": 0.13740092515945435, + "learning_rate": 2.686568667604363e-05, + "loss": 0.08182425498962402, + "mean_token_accuracy": 0.9728358243450974, + "num_tokens": 7699511.0, + "step": 3000 + }, + { + "entropy": 0.11715538412332535, + "epoch": 8.15528781793842, + "grad_norm": 0.22101899981498718, + "learning_rate": 2.3485073211519044e-05, + "loss": 0.06548665523529053, + "mean_token_accuracy": 0.9781955161690712, + "num_tokens": 7822731.0, + "step": 3050 + }, + { + "entropy": 0.10994385546073318, + "epoch": 8.289156626506024, + "grad_norm": 0.18131813406944275, + "learning_rate": 2.0307908989111124e-05, + "loss": 0.06045622825622558, + "mean_token_accuracy": 0.9784942081570626, + "num_tokens": 7954237.0, + "step": 3100 + }, + { + "entropy": 0.10547435775399208, + "epoch": 8.423025435073628, + "grad_norm": 0.219278946518898, + "learning_rate": 1.734111187850385e-05, + "loss": 0.06194626808166504, + "mean_token_accuracy": 0.979062694311142, + "num_tokens": 8082999.0, + "step": 3150 + }, + { + "entropy": 0.10694314314052462, + "epoch": 8.556894243641231, + "grad_norm": 0.21804827451705933, + "learning_rate": 1.4591141701838324e-05, + "loss": 0.06162384033203125, + "mean_token_accuracy": 0.9781702619791031, + "num_tokens": 8214329.0, + "step": 3200 + }, + { + "entropy": 0.10566199742257595, + "epoch": 8.690763052208835, + "grad_norm": 0.12687279284000397, + "learning_rate": 1.2063986168274383e-05, + "loss": 0.06172010898590088, + "mean_token_accuracy": 0.9794953766465188, + "num_tokens": 8345134.0, + "step": 3250 + }, + { + "entropy": 0.11023524977266788, + "epoch": 8.824631860776439, + "grad_norm": 0.14195536077022552, + "learning_rate": 9.765147836518029e-06, + "loss": 0.06327592372894288, + "mean_token_accuracy": 0.9780235534906387, + "num_tokens": 8470788.0, + "step": 3300 + }, + { + "entropy": 0.11061601843684912, + "epoch": 8.958500669344042, + "grad_norm": 0.17863383889198303, + "learning_rate": 7.699632133701809e-06, + "loss": 0.0625003719329834, + "mean_token_accuracy": 0.9776859974861145, + "num_tokens": 8599154.0, + "step": 3350 + }, + { + "epoch": 9.0, + "eval_entropy": 0.2094542646408081, + "eval_loss": 0.9543951153755188, + "eval_mean_token_accuracy": 0.8449241068959236, + "eval_num_tokens": 8637687.0, + "eval_runtime": 51.3693, + "eval_samples_per_second": 31.108, + "eval_steps_per_second": 3.893, + "step": 3366 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.702890522149509e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/README.md b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/adapter_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8818eeae60314bb5e6cd8bb9d31975488784721f --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.0094403300459725, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "q_proj", + "gate_proj", + "o_proj", + "v_proj", + "k_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/chat_template.jinja b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/tokenizer_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/trainer_state.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9ffcde7d23349a3f5589aa30f82fa513a4c60539 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/trainer_state.json @@ -0,0 +1,115 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 374, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.2357245934009553, + "epoch": 0.13386880856760375, + "grad_norm": 1.1940524578094482, + "learning_rate": 3.0637587087867373e-05, + "loss": 2.0509376525878906, + "mean_token_accuracy": 0.6194104523956776, + "num_tokens": 127704.0, + "step": 50 + }, + { + "entropy": 0.8588631230592728, + "epoch": 0.2677376171352075, + "grad_norm": 0.6621095538139343, + "learning_rate": 6.190043105507899e-05, + "loss": 0.8043325805664062, + "mean_token_accuracy": 0.7855384379625321, + "num_tokens": 256077.0, + "step": 100 + }, + { + "entropy": 0.6726470375061036, + "epoch": 0.40160642570281124, + "grad_norm": 0.5255736112594604, + "learning_rate": 9.316327502229059e-05, + "loss": 0.6415129089355469, + "mean_token_accuracy": 0.8183896672725678, + "num_tokens": 387735.0, + "step": 150 + }, + { + "entropy": 0.6119109469652176, + "epoch": 0.535475234270415, + "grad_norm": 0.45975860953330994, + "learning_rate": 0.0001244261189895022, + "loss": 0.576114501953125, + "mean_token_accuracy": 0.8376823288202285, + "num_tokens": 522202.0, + "step": 200 + }, + { + "entropy": 0.5972725109755993, + "epoch": 0.6693440428380187, + "grad_norm": 0.40055611729621887, + "learning_rate": 0.0001556889629567138, + "loss": 0.5613529205322265, + "mean_token_accuracy": 0.8418879929184914, + "num_tokens": 648663.0, + "step": 250 + }, + { + "entropy": 0.5673307004570961, + "epoch": 0.8032128514056225, + "grad_norm": 0.7427454590797424, + "learning_rate": 0.0001869518069239254, + "loss": 0.5325925827026368, + "mean_token_accuracy": 0.8488189685344696, + "num_tokens": 778245.0, + "step": 300 + }, + { + "entropy": 0.5615521620213986, + "epoch": 0.9370816599732262, + "grad_norm": 0.4017025828361511, + "learning_rate": 0.000218214650891137, + "loss": 0.522266616821289, + "mean_token_accuracy": 0.8500416606664658, + "num_tokens": 905328.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5758187392354012, + "eval_loss": 0.5664522051811218, + "eval_mean_token_accuracy": 0.8383084440231323, + "eval_num_tokens": 959743.0, + "eval_runtime": 51.7964, + "eval_samples_per_second": 30.852, + "eval_steps_per_second": 3.861, + "step": 374 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.089940080830976e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/README.md b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/adapter_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8818eeae60314bb5e6cd8bb9d31975488784721f --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.0094403300459725, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "q_proj", + "gate_proj", + "o_proj", + "v_proj", + "k_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/chat_template.jinja b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/tokenizer_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/trainer_state.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..de6713ff31dddbc6400cf45a32cb8ebee6249ce8 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/trainer_state.json @@ -0,0 +1,884 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "eval_steps": 500, + "global_step": 3740, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.2357245934009553, + "epoch": 0.13386880856760375, + "grad_norm": 1.1940524578094482, + "learning_rate": 3.0637587087867373e-05, + "loss": 2.0509376525878906, + "mean_token_accuracy": 0.6194104523956776, + "num_tokens": 127704.0, + "step": 50 + }, + { + "entropy": 0.8588631230592728, + "epoch": 0.2677376171352075, + "grad_norm": 0.6621095538139343, + "learning_rate": 6.190043105507899e-05, + "loss": 0.8043325805664062, + "mean_token_accuracy": 0.7855384379625321, + "num_tokens": 256077.0, + "step": 100 + }, + { + "entropy": 0.6726470375061036, + "epoch": 0.40160642570281124, + "grad_norm": 0.5255736112594604, + "learning_rate": 9.316327502229059e-05, + "loss": 0.6415129089355469, + "mean_token_accuracy": 0.8183896672725678, + "num_tokens": 387735.0, + "step": 150 + }, + { + "entropy": 0.6119109469652176, + "epoch": 0.535475234270415, + "grad_norm": 0.45975860953330994, + "learning_rate": 0.0001244261189895022, + "loss": 0.576114501953125, + "mean_token_accuracy": 0.8376823288202285, + "num_tokens": 522202.0, + "step": 200 + }, + { + "entropy": 0.5972725109755993, + "epoch": 0.6693440428380187, + "grad_norm": 0.40055611729621887, + "learning_rate": 0.0001556889629567138, + "loss": 0.5613529205322265, + "mean_token_accuracy": 0.8418879929184914, + "num_tokens": 648663.0, + "step": 250 + }, + { + "entropy": 0.5673307004570961, + "epoch": 0.8032128514056225, + "grad_norm": 0.7427454590797424, + "learning_rate": 0.0001869518069239254, + "loss": 0.5325925827026368, + "mean_token_accuracy": 0.8488189685344696, + "num_tokens": 778245.0, + "step": 300 + }, + { + "entropy": 0.5615521620213986, + "epoch": 0.9370816599732262, + "grad_norm": 0.4017025828361511, + "learning_rate": 0.000218214650891137, + "loss": 0.522266616821289, + "mean_token_accuracy": 0.8500416606664658, + "num_tokens": 905328.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5758187392354012, + "eval_loss": 0.5664522051811218, + "eval_mean_token_accuracy": 0.8383084440231323, + "eval_num_tokens": 959743.0, + "eval_runtime": 51.7964, + "eval_samples_per_second": 30.852, + "eval_steps_per_second": 3.861, + "step": 374 + }, + { + "entropy": 0.5440343178883947, + "epoch": 1.069611780455154, + "grad_norm": 0.3084093928337097, + "learning_rate": 0.00023381424541885068, + "loss": 0.5051319122314453, + "mean_token_accuracy": 0.8554374280602041, + "num_tokens": 1022581.0, + "step": 400 + }, + { + "entropy": 0.5245272906124592, + "epoch": 1.2034805890227578, + "grad_norm": 0.3342743515968323, + "learning_rate": 0.00023355972972676628, + "loss": 0.4855318450927734, + "mean_token_accuracy": 0.8584430786967278, + "num_tokens": 1151430.0, + "step": 450 + }, + { + "entropy": 0.5234513898193837, + "epoch": 1.3373493975903614, + "grad_norm": 0.26953577995300293, + "learning_rate": 0.00023305125251804043, + "loss": 0.4811768341064453, + "mean_token_accuracy": 0.8606387570500373, + "num_tokens": 1282481.0, + "step": 500 + }, + { + "entropy": 0.5147616830468178, + "epoch": 1.4712182061579653, + "grad_norm": 0.28791534900665283, + "learning_rate": 0.0002322899209369128, + "loss": 0.4762062835693359, + "mean_token_accuracy": 0.8612849581241607, + "num_tokens": 1414166.0, + "step": 550 + }, + { + "entropy": 0.5060296922922134, + "epoch": 1.605087014725569, + "grad_norm": 0.2496163249015808, + "learning_rate": 0.0002312773926857543, + "loss": 0.4695176315307617, + "mean_token_accuracy": 0.8636157616972924, + "num_tokens": 1547902.0, + "step": 600 + }, + { + "entropy": 0.5008939932286739, + "epoch": 1.7389558232931726, + "grad_norm": 0.2329542636871338, + "learning_rate": 0.00023001587241563198, + "loss": 0.46317913055419924, + "mean_token_accuracy": 0.8652430367469788, + "num_tokens": 1678635.0, + "step": 650 + }, + { + "entropy": 0.4986082436144352, + "epoch": 1.8728246318607764, + "grad_norm": 0.2459402084350586, + "learning_rate": 0.00022850810692596235, + "loss": 0.4617066192626953, + "mean_token_accuracy": 0.8672981086373329, + "num_tokens": 1803328.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.546445109397173, + "eval_loss": 0.5307288765907288, + "eval_mean_token_accuracy": 0.8434162598848343, + "eval_num_tokens": 1919486.0, + "eval_runtime": 51.3627, + "eval_samples_per_second": 31.112, + "eval_steps_per_second": 3.894, + "step": 748 + }, + { + "entropy": 0.5069879525237613, + "epoch": 2.005354752342704, + "grad_norm": 0.21533548831939697, + "learning_rate": 0.00022675737918370628, + "loss": 0.4585062026977539, + "mean_token_accuracy": 0.865652882390552, + "num_tokens": 1925307.0, + "step": 750 + }, + { + "entropy": 0.4457989126443863, + "epoch": 2.139223560910308, + "grad_norm": 0.24705350399017334, + "learning_rate": 0.00022476750117512737, + "loss": 0.4026710891723633, + "mean_token_accuracy": 0.8774338760972022, + "num_tokens": 2055627.0, + "step": 800 + }, + { + "entropy": 0.45084999009966853, + "epoch": 2.2730923694779115, + "grad_norm": 0.26643863320350647, + "learning_rate": 0.00022254280560567822, + "loss": 0.40950340270996094, + "mean_token_accuracy": 0.8752825647592545, + "num_tokens": 2177850.0, + "step": 850 + }, + { + "entropy": 0.4516983331739903, + "epoch": 2.4069611780455156, + "grad_norm": 0.26972198486328125, + "learning_rate": 0.00022008813646608725, + "loss": 0.4115512466430664, + "mean_token_accuracy": 0.8761435833573341, + "num_tokens": 2304612.0, + "step": 900 + }, + { + "entropy": 0.45280722543597224, + "epoch": 2.540829986613119, + "grad_norm": 0.2643600106239319, + "learning_rate": 0.00021740883848518684, + "loss": 0.41181053161621095, + "mean_token_accuracy": 0.8756420350074768, + "num_tokens": 2430946.0, + "step": 950 + }, + { + "entropy": 0.4487862553447485, + "epoch": 2.674698795180723, + "grad_norm": 0.2849285304546356, + "learning_rate": 0.00021451074549244846, + "loss": 0.4094270706176758, + "mean_token_accuracy": 0.8771369129419326, + "num_tokens": 2557241.0, + "step": 1000 + }, + { + "entropy": 0.45000545382499696, + "epoch": 2.8085676037483265, + "grad_norm": 0.24081671237945557, + "learning_rate": 0.0002114001677155633, + "loss": 0.4073855972290039, + "mean_token_accuracy": 0.8779223081469536, + "num_tokens": 2692775.0, + "step": 1050 + }, + { + "entropy": 0.4576124830543995, + "epoch": 2.9424364123159306, + "grad_norm": 0.2341010719537735, + "learning_rate": 0.00020808387804072673, + "loss": 0.4154107666015625, + "mean_token_accuracy": 0.8756425747275353, + "num_tokens": 2823060.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.4812743577361107, + "eval_loss": 0.523208498954773, + "eval_mean_token_accuracy": 0.8494922530651092, + "eval_num_tokens": 2879229.0, + "eval_runtime": 51.3707, + "eval_samples_per_second": 31.107, + "eval_steps_per_second": 3.893, + "step": 1122 + }, + { + "entropy": 0.41342759042075183, + "epoch": 3.074966532797858, + "grad_norm": 0.34829744696617126, + "learning_rate": 0.0002045690972655427, + "loss": 0.3644887542724609, + "mean_token_accuracy": 0.8879408977850519, + "num_tokens": 2955424.0, + "step": 1150 + }, + { + "entropy": 0.3948360003530979, + "epoch": 3.208835341365462, + "grad_norm": 0.312427282333374, + "learning_rate": 0.00020086347837665854, + "loss": 0.34146129608154296, + "mean_token_accuracy": 0.8914449456334114, + "num_tokens": 3078799.0, + "step": 1200 + }, + { + "entropy": 0.38602679744362833, + "epoch": 3.3427041499330654, + "grad_norm": 0.2715625762939453, + "learning_rate": 0.0001969750898863629, + "loss": 0.34105979919433593, + "mean_token_accuracy": 0.8928995525836945, + "num_tokens": 3211945.0, + "step": 1250 + }, + { + "entropy": 0.3923524462431669, + "epoch": 3.4765729585006695, + "grad_norm": 0.27295640110969543, + "learning_rate": 0.00019291239826442992, + "loss": 0.3473458099365234, + "mean_token_accuracy": 0.8913829082250595, + "num_tokens": 3343933.0, + "step": 1300 + }, + { + "entropy": 0.40172914519906044, + "epoch": 3.610441767068273, + "grad_norm": 0.32684192061424255, + "learning_rate": 0.0001886842495034615, + "loss": 0.35543827056884764, + "mean_token_accuracy": 0.8904094022512435, + "num_tokens": 3470087.0, + "step": 1350 + }, + { + "entropy": 0.4010512103140354, + "epoch": 3.7443105756358768, + "grad_norm": 0.23922984302043915, + "learning_rate": 0.00018429984985786734, + "loss": 0.3535212326049805, + "mean_token_accuracy": 0.8891122484207153, + "num_tokens": 3590858.0, + "step": 1400 + }, + { + "entropy": 0.39226082623004915, + "epoch": 3.878179384203481, + "grad_norm": 0.25130993127822876, + "learning_rate": 0.00017976874579842046, + "loss": 0.3484851837158203, + "mean_token_accuracy": 0.8916165816783905, + "num_tokens": 3725806.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.45332007586956025, + "eval_loss": 0.5243425965309143, + "eval_mean_token_accuracy": 0.8532214590907097, + "eval_num_tokens": 3838972.0, + "eval_runtime": 51.3837, + "eval_samples_per_second": 31.099, + "eval_steps_per_second": 3.892, + "step": 1496 + }, + { + "entropy": 0.39358587043754983, + "epoch": 4.010709504685408, + "grad_norm": 0.27960067987442017, + "learning_rate": 0.0001751008032260355, + "loss": 0.34616813659667967, + "mean_token_accuracy": 0.8923709010234987, + "num_tokens": 3849380.0, + "step": 1500 + }, + { + "entropy": 0.3206369188427925, + "epoch": 4.144578313253012, + "grad_norm": 0.37261858582496643, + "learning_rate": 0.00017030618599002818, + "loss": 0.2684581565856934, + "mean_token_accuracy": 0.9131761506199837, + "num_tokens": 3976694.0, + "step": 1550 + }, + { + "entropy": 0.3254615054279566, + "epoch": 4.278447121820616, + "grad_norm": 0.39803799986839294, + "learning_rate": 0.00016539533375763032, + "loss": 0.2769618606567383, + "mean_token_accuracy": 0.9102409112453461, + "num_tokens": 4110204.0, + "step": 1600 + }, + { + "entropy": 0.32003962114453316, + "epoch": 4.412315930388219, + "grad_norm": 0.35707366466522217, + "learning_rate": 0.0001603789392829468, + "loss": 0.2749842834472656, + "mean_token_accuracy": 0.910626070201397, + "num_tokens": 4240883.0, + "step": 1650 + }, + { + "entropy": 0.32672152675688265, + "epoch": 4.546184738955823, + "grad_norm": 0.47052621841430664, + "learning_rate": 0.00015526792512484774, + "loss": 0.27983531951904295, + "mean_token_accuracy": 0.9093958771228791, + "num_tokens": 4365381.0, + "step": 1700 + }, + { + "entropy": 0.33449163861572745, + "epoch": 4.680053547523427, + "grad_norm": 0.330709844827652, + "learning_rate": 0.00015007341986449012, + "loss": 0.28533639907836916, + "mean_token_accuracy": 0.9082232251763344, + "num_tokens": 4490711.0, + "step": 1750 + }, + { + "entropy": 0.33353066638112067, + "epoch": 4.813922356091031, + "grad_norm": 0.3990134298801422, + "learning_rate": 0.00014480673387425272, + "loss": 0.28489078521728517, + "mean_token_accuracy": 0.908001911342144, + "num_tokens": 4618532.0, + "step": 1800 + }, + { + "entropy": 0.3272412090748549, + "epoch": 4.947791164658635, + "grad_norm": 0.3183020353317261, + "learning_rate": 0.00013947933469084315, + "loss": 0.2772365379333496, + "mean_token_accuracy": 0.908946952521801, + "num_tokens": 4752261.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.3932068006694317, + "eval_loss": 0.5647156834602356, + "eval_mean_token_accuracy": 0.85078628718853, + "eval_num_tokens": 4798715.0, + "eval_runtime": 51.3578, + "eval_samples_per_second": 31.115, + "eval_steps_per_second": 3.894, + "step": 1870 + }, + { + "entropy": 0.2832252390005372, + "epoch": 5.080321285140562, + "grad_norm": 0.3697633147239685, + "learning_rate": 0.00013410282204620014, + "loss": 0.2279021453857422, + "mean_token_accuracy": 0.9252248072262966, + "num_tokens": 4879271.0, + "step": 1900 + }, + { + "entropy": 0.250804705247283, + "epoch": 5.214190093708166, + "grad_norm": 0.3890162706375122, + "learning_rate": 0.00012868890261055722, + "loss": 0.1980854606628418, + "mean_token_accuracy": 0.9338876655697823, + "num_tokens": 5005076.0, + "step": 1950 + }, + { + "entropy": 0.2531572911888361, + "epoch": 5.34805890227577, + "grad_norm": 0.43466225266456604, + "learning_rate": 0.0001232493645026623, + "loss": 0.20114482879638673, + "mean_token_accuracy": 0.9317018255591393, + "num_tokens": 5133591.0, + "step": 2000 + }, + { + "entropy": 0.25918263107538225, + "epoch": 5.481927710843373, + "grad_norm": 0.38253673911094666, + "learning_rate": 0.00011779605162265297, + "loss": 0.2056061363220215, + "mean_token_accuracy": 0.9302830925583839, + "num_tokens": 5257252.0, + "step": 2050 + }, + { + "entropy": 0.2553627458959818, + "epoch": 5.615796519410977, + "grad_norm": 0.4536231458187103, + "learning_rate": 0.00011234083786347563, + "loss": 0.20531394958496094, + "mean_token_accuracy": 0.9302299374341965, + "num_tokens": 5388652.0, + "step": 2100 + }, + { + "entropy": 0.2575570110231638, + "epoch": 5.749665327978581, + "grad_norm": 0.36399731040000916, + "learning_rate": 0.00010689560125699833, + "loss": 0.2048162841796875, + "mean_token_accuracy": 0.9306997761130333, + "num_tokens": 5515488.0, + "step": 2150 + }, + { + "entropy": 0.24660897620022296, + "epoch": 5.883534136546185, + "grad_norm": 0.43602702021598816, + "learning_rate": 0.00010147219811111233, + "loss": 0.1986431884765625, + "mean_token_accuracy": 0.9335029146075249, + "num_tokens": 5644323.0, + "step": 2200 + }, + { + "epoch": 6.0, + "eval_entropy": 0.32280903935432437, + "eval_loss": 0.6383674144744873, + "eval_mean_token_accuracy": 0.849581449329853, + "eval_num_tokens": 5758458.0, + "eval_runtime": 51.3251, + "eval_samples_per_second": 31.135, + "eval_steps_per_second": 3.897, + "step": 2244 + }, + { + "entropy": 0.24638524448329752, + "epoch": 6.016064257028113, + "grad_norm": 0.3538878262042999, + "learning_rate": 9.608243719413435e-05, + "loss": 0.19203664779663085, + "mean_token_accuracy": 0.9353304363862432, + "num_tokens": 5773027.0, + "step": 2250 + }, + { + "entropy": 0.17814311504364014, + "epoch": 6.149933065595716, + "grad_norm": 0.3886161148548126, + "learning_rate": 9.07380540227205e-05, + "loss": 0.12442682266235351, + "mean_token_accuracy": 0.9582101872563362, + "num_tokens": 5904840.0, + "step": 2300 + }, + { + "entropy": 0.17237136442214251, + "epoch": 6.28380187416332, + "grad_norm": 0.38807374238967896, + "learning_rate": 8.545068530927622e-05, + "loss": 0.12445520401000977, + "mean_token_accuracy": 0.9580146077275277, + "num_tokens": 6037457.0, + "step": 2350 + }, + { + "entropy": 0.18334724467247723, + "epoch": 6.417670682730924, + "grad_norm": 0.48334068059921265, + "learning_rate": 8.023184362449975e-05, + "loss": 0.12853397369384767, + "mean_token_accuracy": 0.956232733130455, + "num_tokens": 6161042.0, + "step": 2400 + }, + { + "entropy": 0.17894859783351422, + "epoch": 6.551539491298527, + "grad_norm": 0.3343403935432434, + "learning_rate": 7.509289233022861e-05, + "loss": 0.12605968475341797, + "mean_token_accuracy": 0.9566894540190697, + "num_tokens": 6291748.0, + "step": 2450 + }, + { + "entropy": 0.17900108266621828, + "epoch": 6.685408299866131, + "grad_norm": 0.41615480184555054, + "learning_rate": 7.00450208371691e-05, + "loss": 0.12843725204467774, + "mean_token_accuracy": 0.956638223528862, + "num_tokens": 6419265.0, + "step": 2500 + }, + { + "entropy": 0.18341445792466401, + "epoch": 6.8192771084337345, + "grad_norm": 0.3722545802593231, + "learning_rate": 6.509922024138231e-05, + "loss": 0.13251185417175293, + "mean_token_accuracy": 0.9549383011460304, + "num_tokens": 6544758.0, + "step": 2550 + }, + { + "entropy": 0.17789534136652946, + "epoch": 6.953145917001339, + "grad_norm": 0.33068087697029114, + "learning_rate": 6.02662593925748e-05, + "loss": 0.126302547454834, + "mean_token_accuracy": 0.9568320420384407, + "num_tokens": 6674626.0, + "step": 2600 + }, + { + "epoch": 7.0, + "eval_entropy": 0.2639452085644007, + "eval_loss": 0.7510635852813721, + "eval_mean_token_accuracy": 0.8456276795268058, + "eval_num_tokens": 6718201.0, + "eval_runtime": 51.3435, + "eval_samples_per_second": 31.124, + "eval_steps_per_second": 3.895, + "step": 2618 + }, + { + "entropy": 0.15149398003187445, + "epoch": 7.085676037483267, + "grad_norm": 0.3362366855144501, + "learning_rate": 5.5556661446302733e-05, + "loss": 0.09618576049804688, + "mean_token_accuracy": 0.9674260706612559, + "num_tokens": 6803830.0, + "step": 2650 + }, + { + "entropy": 0.1279136904887855, + "epoch": 7.21954484605087, + "grad_norm": 0.2925446033477783, + "learning_rate": 5.0980680951143166e-05, + "loss": 0.07902004718780517, + "mean_token_accuracy": 0.9733691918849945, + "num_tokens": 6936289.0, + "step": 2700 + }, + { + "entropy": 0.13701909594237804, + "epoch": 7.353413654618474, + "grad_norm": 0.2659797668457031, + "learning_rate": 4.6548281520723104e-05, + "loss": 0.08250561714172364, + "mean_token_accuracy": 0.971816695034504, + "num_tokens": 7057823.0, + "step": 2750 + }, + { + "entropy": 0.12771729078143834, + "epoch": 7.4872824631860775, + "grad_norm": 0.34247028827667236, + "learning_rate": 4.2269114139222296e-05, + "loss": 0.08109721183776855, + "mean_token_accuracy": 0.9735026282072067, + "num_tokens": 7187020.0, + "step": 2800 + }, + { + "entropy": 0.12350119687616826, + "epoch": 7.621151271753681, + "grad_norm": 0.40673893690109253, + "learning_rate": 3.8152496147586614e-05, + "loss": 0.07707037448883057, + "mean_token_accuracy": 0.9743763041496277, + "num_tokens": 7323457.0, + "step": 2850 + }, + { + "entropy": 0.12642662361264229, + "epoch": 7.755020080321285, + "grad_norm": 0.3064998984336853, + "learning_rate": 3.4207390956206875e-05, + "loss": 0.07985133647918702, + "mean_token_accuracy": 0.9733496251702308, + "num_tokens": 7451765.0, + "step": 2900 + }, + { + "entropy": 0.1269074559956789, + "epoch": 7.888888888888889, + "grad_norm": 0.24833732843399048, + "learning_rate": 3.0442388528236647e-05, + "loss": 0.0821513843536377, + "mean_token_accuracy": 0.9733479696512223, + "num_tokens": 7578222.0, + "step": 2950 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2316589906811714, + "eval_loss": 0.851987898349762, + "eval_mean_token_accuracy": 0.8467304027080536, + "eval_num_tokens": 7677944.0, + "eval_runtime": 51.3501, + "eval_samples_per_second": 31.12, + "eval_steps_per_second": 3.895, + "step": 2992 + }, + { + "entropy": 0.13051860011888272, + "epoch": 8.021419009370817, + "grad_norm": 0.13740092515945435, + "learning_rate": 2.686568667604363e-05, + "loss": 0.08182425498962402, + "mean_token_accuracy": 0.9728358243450974, + "num_tokens": 7699511.0, + "step": 3000 + }, + { + "entropy": 0.11715538412332535, + "epoch": 8.15528781793842, + "grad_norm": 0.22101899981498718, + "learning_rate": 2.3485073211519044e-05, + "loss": 0.06548665523529053, + "mean_token_accuracy": 0.9781955161690712, + "num_tokens": 7822731.0, + "step": 3050 + }, + { + "entropy": 0.10994385546073318, + "epoch": 8.289156626506024, + "grad_norm": 0.18131813406944275, + "learning_rate": 2.0307908989111124e-05, + "loss": 0.06045622825622558, + "mean_token_accuracy": 0.9784942081570626, + "num_tokens": 7954237.0, + "step": 3100 + }, + { + "entropy": 0.10547435775399208, + "epoch": 8.423025435073628, + "grad_norm": 0.219278946518898, + "learning_rate": 1.734111187850385e-05, + "loss": 0.06194626808166504, + "mean_token_accuracy": 0.979062694311142, + "num_tokens": 8082999.0, + "step": 3150 + }, + { + "entropy": 0.10694314314052462, + "epoch": 8.556894243641231, + "grad_norm": 0.21804827451705933, + "learning_rate": 1.4591141701838324e-05, + "loss": 0.06162384033203125, + "mean_token_accuracy": 0.9781702619791031, + "num_tokens": 8214329.0, + "step": 3200 + }, + { + "entropy": 0.10566199742257595, + "epoch": 8.690763052208835, + "grad_norm": 0.12687279284000397, + "learning_rate": 1.2063986168274383e-05, + "loss": 0.06172010898590088, + "mean_token_accuracy": 0.9794953766465188, + "num_tokens": 8345134.0, + "step": 3250 + }, + { + "entropy": 0.11023524977266788, + "epoch": 8.824631860776439, + "grad_norm": 0.14195536077022552, + "learning_rate": 9.765147836518029e-06, + "loss": 0.06327592372894288, + "mean_token_accuracy": 0.9780235534906387, + "num_tokens": 8470788.0, + "step": 3300 + }, + { + "entropy": 0.11061601843684912, + "epoch": 8.958500669344042, + "grad_norm": 0.17863383889198303, + "learning_rate": 7.699632133701809e-06, + "loss": 0.0625003719329834, + "mean_token_accuracy": 0.9776859974861145, + "num_tokens": 8599154.0, + "step": 3350 + }, + { + "epoch": 9.0, + "eval_entropy": 0.2094542646408081, + "eval_loss": 0.9543951153755188, + "eval_mean_token_accuracy": 0.8449241068959236, + "eval_num_tokens": 8637687.0, + "eval_runtime": 51.3693, + "eval_samples_per_second": 31.108, + "eval_steps_per_second": 3.893, + "step": 3366 + }, + { + "entropy": 0.11372084063336704, + "epoch": 9.09103078982597, + "grad_norm": 0.18322895467281342, + "learning_rate": 5.871936456706078e-06, + "loss": 0.061551513671875, + "mean_token_accuracy": 0.9781497656696975, + "num_tokens": 8718657.0, + "step": 3400 + }, + { + "entropy": 0.1057378869690001, + "epoch": 9.224899598393574, + "grad_norm": 0.1794043332338333, + "learning_rate": 4.286040379651099e-06, + "loss": 0.05919742107391358, + "mean_token_accuracy": 0.979441005885601, + "num_tokens": 8841327.0, + "step": 3450 + }, + { + "entropy": 0.10552630050107836, + "epoch": 9.358768406961179, + "grad_norm": 0.2005266696214676, + "learning_rate": 2.945396988882265e-06, + "loss": 0.05768038272857666, + "mean_token_accuracy": 0.9795372131466865, + "num_tokens": 8968182.0, + "step": 3500 + }, + { + "entropy": 0.0980272913351655, + "epoch": 9.492637215528783, + "grad_norm": 0.22345173358917236, + "learning_rate": 1.8529253643150706e-06, + "loss": 0.05448314189910888, + "mean_token_accuracy": 0.9807884976267814, + "num_tokens": 9103318.0, + "step": 3550 + }, + { + "entropy": 0.10432180495932698, + "epoch": 9.626506024096386, + "grad_norm": 0.1691906750202179, + "learning_rate": 1.0110042235111828e-06, + "loss": 0.05666207790374756, + "mean_token_accuracy": 0.9797532597184181, + "num_tokens": 9233233.0, + "step": 3600 + }, + { + "entropy": 0.10123609615489841, + "epoch": 9.76037483266399, + "grad_norm": 0.21184755861759186, + "learning_rate": 4.214667423244783e-07, + "loss": 0.056021313667297366, + "mean_token_accuracy": 0.9803864064812661, + "num_tokens": 9362807.0, + "step": 3650 + }, + { + "entropy": 0.09557499976828694, + "epoch": 9.894243641231594, + "grad_norm": 0.19447794556617737, + "learning_rate": 8.559656339447186e-08, + "loss": 0.05355457782745361, + "mean_token_accuracy": 0.9813841906189918, + "num_tokens": 9500858.0, + "step": 3700 + }, + { + "epoch": 10.0, + "eval_entropy": 0.20286250963807106, + "eval_loss": 0.9991143941879272, + "eval_mean_token_accuracy": 0.8458420696854592, + "eval_num_tokens": 9597430.0, + "eval_runtime": 51.3552, + "eval_samples_per_second": 31.117, + "eval_steps_per_second": 3.894, + "step": 3740 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.115447797653412e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/README.md b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/adapter_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8818eeae60314bb5e6cd8bb9d31975488784721f --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.0094403300459725, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "q_proj", + "gate_proj", + "o_proj", + "v_proj", + "k_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/chat_template.jinja b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/tokenizer_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/trainer_state.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d9bf2568583ede88c8066a63b851d90d79cbbdff --- /dev/null +++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/trainer_state.json @@ -0,0 +1,196 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 748, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.2357245934009553, + "epoch": 0.13386880856760375, + "grad_norm": 1.1940524578094482, + "learning_rate": 3.0637587087867373e-05, + "loss": 2.0509376525878906, + "mean_token_accuracy": 0.6194104523956776, + "num_tokens": 127704.0, + "step": 50 + }, + { + "entropy": 0.8588631230592728, + "epoch": 0.2677376171352075, + "grad_norm": 0.6621095538139343, + "learning_rate": 6.190043105507899e-05, + "loss": 0.8043325805664062, + "mean_token_accuracy": 0.7855384379625321, + "num_tokens": 256077.0, + "step": 100 + }, + { + "entropy": 0.6726470375061036, + "epoch": 0.40160642570281124, + "grad_norm": 0.5255736112594604, + "learning_rate": 9.316327502229059e-05, + "loss": 0.6415129089355469, + "mean_token_accuracy": 0.8183896672725678, + "num_tokens": 387735.0, + "step": 150 + }, + { + "entropy": 0.6119109469652176, + "epoch": 0.535475234270415, + "grad_norm": 0.45975860953330994, + "learning_rate": 0.0001244261189895022, + "loss": 0.576114501953125, + "mean_token_accuracy": 0.8376823288202285, + "num_tokens": 522202.0, + "step": 200 + }, + { + "entropy": 0.5972725109755993, + "epoch": 0.6693440428380187, + "grad_norm": 0.40055611729621887, + "learning_rate": 0.0001556889629567138, + "loss": 0.5613529205322265, + "mean_token_accuracy": 0.8418879929184914, + "num_tokens": 648663.0, + "step": 250 + }, + { + "entropy": 0.5673307004570961, + "epoch": 0.8032128514056225, + "grad_norm": 0.7427454590797424, + "learning_rate": 0.0001869518069239254, + "loss": 0.5325925827026368, + "mean_token_accuracy": 0.8488189685344696, + "num_tokens": 778245.0, + "step": 300 + }, + { + "entropy": 0.5615521620213986, + "epoch": 0.9370816599732262, + "grad_norm": 0.4017025828361511, + "learning_rate": 0.000218214650891137, + "loss": 0.522266616821289, + "mean_token_accuracy": 0.8500416606664658, + "num_tokens": 905328.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5758187392354012, + "eval_loss": 0.5664522051811218, + "eval_mean_token_accuracy": 0.8383084440231323, + "eval_num_tokens": 959743.0, + "eval_runtime": 51.7964, + "eval_samples_per_second": 30.852, + "eval_steps_per_second": 3.861, + "step": 374 + }, + { + "entropy": 0.5440343178883947, + "epoch": 1.069611780455154, + "grad_norm": 0.3084093928337097, + "learning_rate": 0.00023381424541885068, + "loss": 0.5051319122314453, + "mean_token_accuracy": 0.8554374280602041, + "num_tokens": 1022581.0, + "step": 400 + }, + { + "entropy": 0.5245272906124592, + "epoch": 1.2034805890227578, + "grad_norm": 0.3342743515968323, + "learning_rate": 0.00023355972972676628, + "loss": 0.4855318450927734, + "mean_token_accuracy": 0.8584430786967278, + "num_tokens": 1151430.0, + "step": 450 + }, + { + "entropy": 0.5234513898193837, + "epoch": 1.3373493975903614, + "grad_norm": 0.26953577995300293, + "learning_rate": 0.00023305125251804043, + "loss": 0.4811768341064453, + "mean_token_accuracy": 0.8606387570500373, + "num_tokens": 1282481.0, + "step": 500 + }, + { + "entropy": 0.5147616830468178, + "epoch": 1.4712182061579653, + "grad_norm": 0.28791534900665283, + "learning_rate": 0.0002322899209369128, + "loss": 0.4762062835693359, + "mean_token_accuracy": 0.8612849581241607, + "num_tokens": 1414166.0, + "step": 550 + }, + { + "entropy": 0.5060296922922134, + "epoch": 1.605087014725569, + "grad_norm": 0.2496163249015808, + "learning_rate": 0.0002312773926857543, + "loss": 0.4695176315307617, + "mean_token_accuracy": 0.8636157616972924, + "num_tokens": 1547902.0, + "step": 600 + }, + { + "entropy": 0.5008939932286739, + "epoch": 1.7389558232931726, + "grad_norm": 0.2329542636871338, + "learning_rate": 0.00023001587241563198, + "loss": 0.46317913055419924, + "mean_token_accuracy": 0.8652430367469788, + "num_tokens": 1678635.0, + "step": 650 + }, + { + "entropy": 0.4986082436144352, + "epoch": 1.8728246318607764, + "grad_norm": 0.2459402084350586, + "learning_rate": 0.00022850810692596235, + "loss": 0.4617066192626953, + "mean_token_accuracy": 0.8672981086373329, + "num_tokens": 1803328.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.546445109397173, + "eval_loss": 0.5307288765907288, + "eval_mean_token_accuracy": 0.8434162598848343, + "eval_num_tokens": 1919486.0, + "eval_runtime": 51.3627, + "eval_samples_per_second": 31.112, + "eval_steps_per_second": 3.894, + "step": 748 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.21288517750313e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/README.md new file mode 100644 index 0000000000000000000000000000000000000000..201b3d4c1f1e8b143370f8952a59e885165126f5 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/README.md @@ -0,0 +1,58 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: transformers +model_name: Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1 +tags: +- generated_from_trainer +- trl +- sft +licence: license +--- + +# Model Card for Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1 + +This model is a fine-tuned version of [Qwen/Qwen3-4B-Base](https://huggingface.co/Qwen/Qwen3-4B-Base). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/katriin-kukk/Cross_lingual_morphological_generalization/runs/mjvftsw2) + + + +This model was trained with SFT. + +### Framework versions + +- TRL: 0.29.0 +- Transformers: 5.5.4 +- Pytorch: 2.10.0 +- Datasets: 4.6.1 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..34b529d53848971eda3155b95c185289489ac0b8 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.04641649878824187, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj", + "gate_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a0fc99cd862def174d805ce466eee3156d96b716 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/trainer_state.json @@ -0,0 +1,297 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1155, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.0450534927845, + "epoch": 0.12987012987012986, + "grad_norm": 0.9224470853805542, + "learning_rate": 1.3970570546126444e-05, + "loss": 1.944740753173828, + "mean_token_accuracy": 0.6158743992447853, + "num_tokens": 133878.0, + "step": 50 + }, + { + "entropy": 1.1079417771101, + "epoch": 0.2597402597402597, + "grad_norm": 0.6827074289321899, + "learning_rate": 2.822625477686771e-05, + "loss": 1.0624147033691407, + "mean_token_accuracy": 0.7396554726362229, + "num_tokens": 265533.0, + "step": 100 + }, + { + "entropy": 0.8689985585212707, + "epoch": 0.38961038961038963, + "grad_norm": 0.6320164203643799, + "learning_rate": 4.248193900760899e-05, + "loss": 0.8256858825683594, + "mean_token_accuracy": 0.7825630265474319, + "num_tokens": 386480.0, + "step": 150 + }, + { + "entropy": 0.7946180325746536, + "epoch": 0.5194805194805194, + "grad_norm": 0.7246975302696228, + "learning_rate": 5.6737623238350247e-05, + "loss": 0.7475, + "mean_token_accuracy": 0.7959125518798829, + "num_tokens": 520135.0, + "step": 200 + }, + { + "entropy": 0.7919283157587051, + "epoch": 0.6493506493506493, + "grad_norm": 0.7635198831558228, + "learning_rate": 7.099330746909153e-05, + "loss": 0.737699966430664, + "mean_token_accuracy": 0.798919832110405, + "num_tokens": 640303.0, + "step": 250 + }, + { + "entropy": 0.7713775283098221, + "epoch": 0.7792207792207793, + "grad_norm": 0.5751814842224121, + "learning_rate": 8.52489916998328e-05, + "loss": 0.7186790466308594, + "mean_token_accuracy": 0.8014196193218232, + "num_tokens": 766027.0, + "step": 300 + }, + { + "entropy": 0.737273331284523, + "epoch": 0.9090909090909091, + "grad_norm": 0.4956009089946747, + "learning_rate": 9.950467593057406e-05, + "loss": 0.6904002380371094, + "mean_token_accuracy": 0.8083628410100937, + "num_tokens": 894871.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.7681069333965962, + "eval_loss": 0.7430074214935303, + "eval_mean_token_accuracy": 0.7941452006881053, + "eval_num_tokens": 981550.0, + "eval_runtime": 47.9814, + "eval_samples_per_second": 34.534, + "eval_steps_per_second": 4.335, + "step": 385 + }, + { + "entropy": 0.7167584246397019, + "epoch": 1.0389610389610389, + "grad_norm": 0.638637900352478, + "learning_rate": 0.00010976434715123926, + "loss": 0.6611510467529297, + "mean_token_accuracy": 0.8145374125242233, + "num_tokens": 1021041.0, + "step": 400 + }, + { + "entropy": 0.6893188625574111, + "epoch": 1.1688311688311688, + "grad_norm": 0.554688036441803, + "learning_rate": 0.00010967639449071182, + "loss": 0.6347718811035157, + "mean_token_accuracy": 0.8226255452632905, + "num_tokens": 1149161.0, + "step": 450 + }, + { + "entropy": 0.6728485250473022, + "epoch": 1.2987012987012987, + "grad_norm": 0.5104691386222839, + "learning_rate": 0.00010947585797076785, + "loss": 0.6233362579345703, + "mean_token_accuracy": 0.8227335858345032, + "num_tokens": 1278079.0, + "step": 500 + }, + { + "entropy": 0.6812490409612656, + "epoch": 1.4285714285714286, + "grad_norm": 0.4700304865837097, + "learning_rate": 0.00010916314964373551, + "loss": 0.6289351654052734, + "mean_token_accuracy": 0.8217820060253144, + "num_tokens": 1405326.0, + "step": 550 + }, + { + "entropy": 0.6633523851633072, + "epoch": 1.5584415584415585, + "grad_norm": 0.5094137191772461, + "learning_rate": 0.0001087389120469154, + "loss": 0.6168266296386719, + "mean_token_accuracy": 0.8232935756444931, + "num_tokens": 1532507.0, + "step": 600 + }, + { + "entropy": 0.6454597359895706, + "epoch": 1.6883116883116882, + "grad_norm": 0.40546780824661255, + "learning_rate": 0.00010820401688232725, + "loss": 0.5995024108886718, + "mean_token_accuracy": 0.8284876370429992, + "num_tokens": 1662747.0, + "step": 650 + }, + { + "entropy": 0.6700882267951965, + "epoch": 1.8181818181818183, + "grad_norm": 0.35203394293785095, + "learning_rate": 0.00010755956322558065, + "loss": 0.616350212097168, + "mean_token_accuracy": 0.8246171402931214, + "num_tokens": 1783817.0, + "step": 700 + }, + { + "entropy": 0.6587968200445176, + "epoch": 1.948051948051948, + "grad_norm": 0.5248188376426697, + "learning_rate": 0.00010680687526754984, + "loss": 0.608861198425293, + "mean_token_accuracy": 0.8265628081560135, + "num_tokens": 1914573.0, + "step": 750 + }, + { + "epoch": 2.0, + "eval_entropy": 0.7101548368541094, + "eval_loss": 0.6869887113571167, + "eval_mean_token_accuracy": 0.8058141409777678, + "eval_num_tokens": 1963100.0, + "eval_runtime": 48.0161, + "eval_samples_per_second": 34.509, + "eval_steps_per_second": 4.332, + "step": 770 + }, + { + "entropy": 0.6272834652662277, + "epoch": 2.0779220779220777, + "grad_norm": 0.5319918990135193, + "learning_rate": 0.00010594749959349313, + "loss": 0.5719264221191406, + "mean_token_accuracy": 0.8345229256153107, + "num_tokens": 2035968.0, + "step": 800 + }, + { + "entropy": 0.599158108830452, + "epoch": 2.207792207792208, + "grad_norm": 0.4296111762523651, + "learning_rate": 0.00010498320200520744, + "loss": 0.5460208129882812, + "mean_token_accuracy": 0.8396762716770172, + "num_tokens": 2168591.0, + "step": 850 + }, + { + "entropy": 0.5994478952884674, + "epoch": 2.3376623376623376, + "grad_norm": 0.3820176422595978, + "learning_rate": 0.00010391596389274791, + "loss": 0.5483282852172852, + "mean_token_accuracy": 0.8385387778282165, + "num_tokens": 2297967.0, + "step": 900 + }, + { + "entropy": 0.5989043036103249, + "epoch": 2.4675324675324677, + "grad_norm": 0.41718003153800964, + "learning_rate": 0.00010274797816316749, + "loss": 0.543673095703125, + "mean_token_accuracy": 0.8396250855922699, + "num_tokens": 2425614.0, + "step": 950 + }, + { + "entropy": 0.6005555561184883, + "epoch": 2.5974025974025974, + "grad_norm": 0.4552708864212036, + "learning_rate": 0.00010148164473464206, + "loss": 0.5505282974243164, + "mean_token_accuracy": 0.8371219438314438, + "num_tokens": 2557421.0, + "step": 1000 + }, + { + "entropy": 0.5972397547960281, + "epoch": 2.7272727272727275, + "grad_norm": 0.45796939730644226, + "learning_rate": 0.00010011956560523972, + "loss": 0.5410661697387695, + "mean_token_accuracy": 0.8398081564903259, + "num_tokens": 2680379.0, + "step": 1050 + }, + { + "entropy": 0.5875487339496612, + "epoch": 2.857142857142857, + "grad_norm": 0.37569308280944824, + "learning_rate": 9.866453950646624e-05, + "loss": 0.537100830078125, + "mean_token_accuracy": 0.8407874500751495, + "num_tokens": 2810854.0, + "step": 1100 + }, + { + "entropy": 0.6040622130036354, + "epoch": 2.987012987012987, + "grad_norm": 0.4184563457965851, + "learning_rate": 9.711955615257278e-05, + "loss": 0.5466165924072266, + "mean_token_accuracy": 0.8411829793453216, + "num_tokens": 2932637.0, + "step": 1150 + }, + { + "epoch": 3.0, + "eval_entropy": 0.6296488522337034, + "eval_loss": 0.671941876411438, + "eval_mean_token_accuracy": 0.8121863231062889, + "eval_num_tokens": 2944650.0, + "eval_runtime": 47.9946, + "eval_samples_per_second": 34.525, + "eval_steps_per_second": 4.334, + "step": 1155 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.637823526955428e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..34b529d53848971eda3155b95c185289489ac0b8 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.04641649878824187, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj", + "gate_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5d2eb3b7496b4e7b927decbc8ca5bae01d559379 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/trainer_state.json @@ -0,0 +1,378 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 1540, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.0450534927845, + "epoch": 0.12987012987012986, + "grad_norm": 0.9224470853805542, + "learning_rate": 1.3970570546126444e-05, + "loss": 1.944740753173828, + "mean_token_accuracy": 0.6158743992447853, + "num_tokens": 133878.0, + "step": 50 + }, + { + "entropy": 1.1079417771101, + "epoch": 0.2597402597402597, + "grad_norm": 0.6827074289321899, + "learning_rate": 2.822625477686771e-05, + "loss": 1.0624147033691407, + "mean_token_accuracy": 0.7396554726362229, + "num_tokens": 265533.0, + "step": 100 + }, + { + "entropy": 0.8689985585212707, + "epoch": 0.38961038961038963, + "grad_norm": 0.6320164203643799, + "learning_rate": 4.248193900760899e-05, + "loss": 0.8256858825683594, + "mean_token_accuracy": 0.7825630265474319, + "num_tokens": 386480.0, + "step": 150 + }, + { + "entropy": 0.7946180325746536, + "epoch": 0.5194805194805194, + "grad_norm": 0.7246975302696228, + "learning_rate": 5.6737623238350247e-05, + "loss": 0.7475, + "mean_token_accuracy": 0.7959125518798829, + "num_tokens": 520135.0, + "step": 200 + }, + { + "entropy": 0.7919283157587051, + "epoch": 0.6493506493506493, + "grad_norm": 0.7635198831558228, + "learning_rate": 7.099330746909153e-05, + "loss": 0.737699966430664, + "mean_token_accuracy": 0.798919832110405, + "num_tokens": 640303.0, + "step": 250 + }, + { + "entropy": 0.7713775283098221, + "epoch": 0.7792207792207793, + "grad_norm": 0.5751814842224121, + "learning_rate": 8.52489916998328e-05, + "loss": 0.7186790466308594, + "mean_token_accuracy": 0.8014196193218232, + "num_tokens": 766027.0, + "step": 300 + }, + { + "entropy": 0.737273331284523, + "epoch": 0.9090909090909091, + "grad_norm": 0.4956009089946747, + "learning_rate": 9.950467593057406e-05, + "loss": 0.6904002380371094, + "mean_token_accuracy": 0.8083628410100937, + "num_tokens": 894871.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.7681069333965962, + "eval_loss": 0.7430074214935303, + "eval_mean_token_accuracy": 0.7941452006881053, + "eval_num_tokens": 981550.0, + "eval_runtime": 47.9814, + "eval_samples_per_second": 34.534, + "eval_steps_per_second": 4.335, + "step": 385 + }, + { + "entropy": 0.7167584246397019, + "epoch": 1.0389610389610389, + "grad_norm": 0.638637900352478, + "learning_rate": 0.00010976434715123926, + "loss": 0.6611510467529297, + "mean_token_accuracy": 0.8145374125242233, + "num_tokens": 1021041.0, + "step": 400 + }, + { + "entropy": 0.6893188625574111, + "epoch": 1.1688311688311688, + "grad_norm": 0.554688036441803, + "learning_rate": 0.00010967639449071182, + "loss": 0.6347718811035157, + "mean_token_accuracy": 0.8226255452632905, + "num_tokens": 1149161.0, + "step": 450 + }, + { + "entropy": 0.6728485250473022, + "epoch": 1.2987012987012987, + "grad_norm": 0.5104691386222839, + "learning_rate": 0.00010947585797076785, + "loss": 0.6233362579345703, + "mean_token_accuracy": 0.8227335858345032, + "num_tokens": 1278079.0, + "step": 500 + }, + { + "entropy": 0.6812490409612656, + "epoch": 1.4285714285714286, + "grad_norm": 0.4700304865837097, + "learning_rate": 0.00010916314964373551, + "loss": 0.6289351654052734, + "mean_token_accuracy": 0.8217820060253144, + "num_tokens": 1405326.0, + "step": 550 + }, + { + "entropy": 0.6633523851633072, + "epoch": 1.5584415584415585, + "grad_norm": 0.5094137191772461, + "learning_rate": 0.0001087389120469154, + "loss": 0.6168266296386719, + "mean_token_accuracy": 0.8232935756444931, + "num_tokens": 1532507.0, + "step": 600 + }, + { + "entropy": 0.6454597359895706, + "epoch": 1.6883116883116882, + "grad_norm": 0.40546780824661255, + "learning_rate": 0.00010820401688232725, + "loss": 0.5995024108886718, + "mean_token_accuracy": 0.8284876370429992, + "num_tokens": 1662747.0, + "step": 650 + }, + { + "entropy": 0.6700882267951965, + "epoch": 1.8181818181818183, + "grad_norm": 0.35203394293785095, + "learning_rate": 0.00010755956322558065, + "loss": 0.616350212097168, + "mean_token_accuracy": 0.8246171402931214, + "num_tokens": 1783817.0, + "step": 700 + }, + { + "entropy": 0.6587968200445176, + "epoch": 1.948051948051948, + "grad_norm": 0.5248188376426697, + "learning_rate": 0.00010680687526754984, + "loss": 0.608861198425293, + "mean_token_accuracy": 0.8265628081560135, + "num_tokens": 1914573.0, + "step": 750 + }, + { + "epoch": 2.0, + "eval_entropy": 0.7101548368541094, + "eval_loss": 0.6869887113571167, + "eval_mean_token_accuracy": 0.8058141409777678, + "eval_num_tokens": 1963100.0, + "eval_runtime": 48.0161, + "eval_samples_per_second": 34.509, + "eval_steps_per_second": 4.332, + "step": 770 + }, + { + "entropy": 0.6272834652662277, + "epoch": 2.0779220779220777, + "grad_norm": 0.5319918990135193, + "learning_rate": 0.00010594749959349313, + "loss": 0.5719264221191406, + "mean_token_accuracy": 0.8345229256153107, + "num_tokens": 2035968.0, + "step": 800 + }, + { + "entropy": 0.599158108830452, + "epoch": 2.207792207792208, + "grad_norm": 0.4296111762523651, + "learning_rate": 0.00010498320200520744, + "loss": 0.5460208129882812, + "mean_token_accuracy": 0.8396762716770172, + "num_tokens": 2168591.0, + "step": 850 + }, + { + "entropy": 0.5994478952884674, + "epoch": 2.3376623376623376, + "grad_norm": 0.3820176422595978, + "learning_rate": 0.00010391596389274791, + "loss": 0.5483282852172852, + "mean_token_accuracy": 0.8385387778282165, + "num_tokens": 2297967.0, + "step": 900 + }, + { + "entropy": 0.5989043036103249, + "epoch": 2.4675324675324677, + "grad_norm": 0.41718003153800964, + "learning_rate": 0.00010274797816316749, + "loss": 0.543673095703125, + "mean_token_accuracy": 0.8396250855922699, + "num_tokens": 2425614.0, + "step": 950 + }, + { + "entropy": 0.6005555561184883, + "epoch": 2.5974025974025974, + "grad_norm": 0.4552708864212036, + "learning_rate": 0.00010148164473464206, + "loss": 0.5505282974243164, + "mean_token_accuracy": 0.8371219438314438, + "num_tokens": 2557421.0, + "step": 1000 + }, + { + "entropy": 0.5972397547960281, + "epoch": 2.7272727272727275, + "grad_norm": 0.45796939730644226, + "learning_rate": 0.00010011956560523972, + "loss": 0.5410661697387695, + "mean_token_accuracy": 0.8398081564903259, + "num_tokens": 2680379.0, + "step": 1050 + }, + { + "entropy": 0.5875487339496612, + "epoch": 2.857142857142857, + "grad_norm": 0.37569308280944824, + "learning_rate": 9.866453950646624e-05, + "loss": 0.537100830078125, + "mean_token_accuracy": 0.8407874500751495, + "num_tokens": 2810854.0, + "step": 1100 + }, + { + "entropy": 0.6040622130036354, + "epoch": 2.987012987012987, + "grad_norm": 0.4184563457965851, + "learning_rate": 9.711955615257278e-05, + "loss": 0.5466165924072266, + "mean_token_accuracy": 0.8411829793453216, + "num_tokens": 2932637.0, + "step": 1150 + }, + { + "epoch": 3.0, + "eval_entropy": 0.6296488522337034, + "eval_loss": 0.671941876411438, + "eval_mean_token_accuracy": 0.8121863231062889, + "eval_num_tokens": 2944650.0, + "eval_runtime": 47.9946, + "eval_samples_per_second": 34.525, + "eval_steps_per_second": 4.334, + "step": 1155 + }, + { + "entropy": 0.543001911342144, + "epoch": 3.116883116883117, + "grad_norm": 0.39842459559440613, + "learning_rate": 9.548779009744178e-05, + "loss": 0.4850382995605469, + "mean_token_accuracy": 0.8531740349531174, + "num_tokens": 3054046.0, + "step": 1200 + }, + { + "entropy": 0.5417884975671768, + "epoch": 3.2467532467532467, + "grad_norm": 0.529087245464325, + "learning_rate": 9.37725942116738e-05, + "loss": 0.48398651123046876, + "mean_token_accuracy": 0.8532969230413436, + "num_tokens": 3182515.0, + "step": 1250 + }, + { + "entropy": 0.5348580208420753, + "epoch": 3.3766233766233764, + "grad_norm": 0.4483303129673004, + "learning_rate": 9.197749279327802e-05, + "loss": 0.4842509078979492, + "mean_token_accuracy": 0.8533288407325744, + "num_tokens": 3313672.0, + "step": 1300 + }, + { + "entropy": 0.5422027057409287, + "epoch": 3.5064935064935066, + "grad_norm": 0.448234885931015, + "learning_rate": 9.010617432612243e-05, + "loss": 0.48615737915039064, + "mean_token_accuracy": 0.8517620968818664, + "num_tokens": 3440689.0, + "step": 1350 + }, + { + "entropy": 0.5518654137849808, + "epoch": 3.6363636363636362, + "grad_norm": 0.5707411170005798, + "learning_rate": 8.816248390102322e-05, + "loss": 0.4946014404296875, + "mean_token_accuracy": 0.8510974669456481, + "num_tokens": 3566427.0, + "step": 1400 + }, + { + "entropy": 0.5268254142999649, + "epoch": 3.7662337662337664, + "grad_norm": 0.42708849906921387, + "learning_rate": 8.615041531504609e-05, + "loss": 0.474882926940918, + "mean_token_accuracy": 0.8555294382572174, + "num_tokens": 3699976.0, + "step": 1450 + }, + { + "entropy": 0.5430487287044525, + "epoch": 3.896103896103896, + "grad_norm": 0.5479393005371094, + "learning_rate": 8.407410286525337e-05, + "loss": 0.48806171417236327, + "mean_token_accuracy": 0.8523763221502304, + "num_tokens": 3827926.0, + "step": 1500 + }, + { + "epoch": 4.0, + "eval_entropy": 0.6002918778417202, + "eval_loss": 0.6801126003265381, + "eval_mean_token_accuracy": 0.8130095520844827, + "eval_num_tokens": 3926200.0, + "eval_runtime": 47.9844, + "eval_samples_per_second": 34.532, + "eval_steps_per_second": 4.335, + "step": 1540 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.1769399455551386e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..34b529d53848971eda3155b95c185289489ac0b8 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.04641649878824187, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj", + "gate_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..73ba7a105468339427e9f7fa854426093084ee1a --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/trainer_state.json @@ -0,0 +1,469 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 1925, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.0450534927845, + "epoch": 0.12987012987012986, + "grad_norm": 0.9224470853805542, + "learning_rate": 1.3970570546126444e-05, + "loss": 1.944740753173828, + "mean_token_accuracy": 0.6158743992447853, + "num_tokens": 133878.0, + "step": 50 + }, + { + "entropy": 1.1079417771101, + "epoch": 0.2597402597402597, + "grad_norm": 0.6827074289321899, + "learning_rate": 2.822625477686771e-05, + "loss": 1.0624147033691407, + "mean_token_accuracy": 0.7396554726362229, + "num_tokens": 265533.0, + "step": 100 + }, + { + "entropy": 0.8689985585212707, + "epoch": 0.38961038961038963, + "grad_norm": 0.6320164203643799, + "learning_rate": 4.248193900760899e-05, + "loss": 0.8256858825683594, + "mean_token_accuracy": 0.7825630265474319, + "num_tokens": 386480.0, + "step": 150 + }, + { + "entropy": 0.7946180325746536, + "epoch": 0.5194805194805194, + "grad_norm": 0.7246975302696228, + "learning_rate": 5.6737623238350247e-05, + "loss": 0.7475, + "mean_token_accuracy": 0.7959125518798829, + "num_tokens": 520135.0, + "step": 200 + }, + { + "entropy": 0.7919283157587051, + "epoch": 0.6493506493506493, + "grad_norm": 0.7635198831558228, + "learning_rate": 7.099330746909153e-05, + "loss": 0.737699966430664, + "mean_token_accuracy": 0.798919832110405, + "num_tokens": 640303.0, + "step": 250 + }, + { + "entropy": 0.7713775283098221, + "epoch": 0.7792207792207793, + "grad_norm": 0.5751814842224121, + "learning_rate": 8.52489916998328e-05, + "loss": 0.7186790466308594, + "mean_token_accuracy": 0.8014196193218232, + "num_tokens": 766027.0, + "step": 300 + }, + { + "entropy": 0.737273331284523, + "epoch": 0.9090909090909091, + "grad_norm": 0.4956009089946747, + "learning_rate": 9.950467593057406e-05, + "loss": 0.6904002380371094, + "mean_token_accuracy": 0.8083628410100937, + "num_tokens": 894871.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.7681069333965962, + "eval_loss": 0.7430074214935303, + "eval_mean_token_accuracy": 0.7941452006881053, + "eval_num_tokens": 981550.0, + "eval_runtime": 47.9814, + "eval_samples_per_second": 34.534, + "eval_steps_per_second": 4.335, + "step": 385 + }, + { + "entropy": 0.7167584246397019, + "epoch": 1.0389610389610389, + "grad_norm": 0.638637900352478, + "learning_rate": 0.00010976434715123926, + "loss": 0.6611510467529297, + "mean_token_accuracy": 0.8145374125242233, + "num_tokens": 1021041.0, + "step": 400 + }, + { + "entropy": 0.6893188625574111, + "epoch": 1.1688311688311688, + "grad_norm": 0.554688036441803, + "learning_rate": 0.00010967639449071182, + "loss": 0.6347718811035157, + "mean_token_accuracy": 0.8226255452632905, + "num_tokens": 1149161.0, + "step": 450 + }, + { + "entropy": 0.6728485250473022, + "epoch": 1.2987012987012987, + "grad_norm": 0.5104691386222839, + "learning_rate": 0.00010947585797076785, + "loss": 0.6233362579345703, + "mean_token_accuracy": 0.8227335858345032, + "num_tokens": 1278079.0, + "step": 500 + }, + { + "entropy": 0.6812490409612656, + "epoch": 1.4285714285714286, + "grad_norm": 0.4700304865837097, + "learning_rate": 0.00010916314964373551, + "loss": 0.6289351654052734, + "mean_token_accuracy": 0.8217820060253144, + "num_tokens": 1405326.0, + "step": 550 + }, + { + "entropy": 0.6633523851633072, + "epoch": 1.5584415584415585, + "grad_norm": 0.5094137191772461, + "learning_rate": 0.0001087389120469154, + "loss": 0.6168266296386719, + "mean_token_accuracy": 0.8232935756444931, + "num_tokens": 1532507.0, + "step": 600 + }, + { + "entropy": 0.6454597359895706, + "epoch": 1.6883116883116882, + "grad_norm": 0.40546780824661255, + "learning_rate": 0.00010820401688232725, + "loss": 0.5995024108886718, + "mean_token_accuracy": 0.8284876370429992, + "num_tokens": 1662747.0, + "step": 650 + }, + { + "entropy": 0.6700882267951965, + "epoch": 1.8181818181818183, + "grad_norm": 0.35203394293785095, + "learning_rate": 0.00010755956322558065, + "loss": 0.616350212097168, + "mean_token_accuracy": 0.8246171402931214, + "num_tokens": 1783817.0, + "step": 700 + }, + { + "entropy": 0.6587968200445176, + "epoch": 1.948051948051948, + "grad_norm": 0.5248188376426697, + "learning_rate": 0.00010680687526754984, + "loss": 0.608861198425293, + "mean_token_accuracy": 0.8265628081560135, + "num_tokens": 1914573.0, + "step": 750 + }, + { + "epoch": 2.0, + "eval_entropy": 0.7101548368541094, + "eval_loss": 0.6869887113571167, + "eval_mean_token_accuracy": 0.8058141409777678, + "eval_num_tokens": 1963100.0, + "eval_runtime": 48.0161, + "eval_samples_per_second": 34.509, + "eval_steps_per_second": 4.332, + "step": 770 + }, + { + "entropy": 0.6272834652662277, + "epoch": 2.0779220779220777, + "grad_norm": 0.5319918990135193, + "learning_rate": 0.00010594749959349313, + "loss": 0.5719264221191406, + "mean_token_accuracy": 0.8345229256153107, + "num_tokens": 2035968.0, + "step": 800 + }, + { + "entropy": 0.599158108830452, + "epoch": 2.207792207792208, + "grad_norm": 0.4296111762523651, + "learning_rate": 0.00010498320200520744, + "loss": 0.5460208129882812, + "mean_token_accuracy": 0.8396762716770172, + "num_tokens": 2168591.0, + "step": 850 + }, + { + "entropy": 0.5994478952884674, + "epoch": 2.3376623376623376, + "grad_norm": 0.3820176422595978, + "learning_rate": 0.00010391596389274791, + "loss": 0.5483282852172852, + "mean_token_accuracy": 0.8385387778282165, + "num_tokens": 2297967.0, + "step": 900 + }, + { + "entropy": 0.5989043036103249, + "epoch": 2.4675324675324677, + "grad_norm": 0.41718003153800964, + "learning_rate": 0.00010274797816316749, + "loss": 0.543673095703125, + "mean_token_accuracy": 0.8396250855922699, + "num_tokens": 2425614.0, + "step": 950 + }, + { + "entropy": 0.6005555561184883, + "epoch": 2.5974025974025974, + "grad_norm": 0.4552708864212036, + "learning_rate": 0.00010148164473464206, + "loss": 0.5505282974243164, + "mean_token_accuracy": 0.8371219438314438, + "num_tokens": 2557421.0, + "step": 1000 + }, + { + "entropy": 0.5972397547960281, + "epoch": 2.7272727272727275, + "grad_norm": 0.45796939730644226, + "learning_rate": 0.00010011956560523972, + "loss": 0.5410661697387695, + "mean_token_accuracy": 0.8398081564903259, + "num_tokens": 2680379.0, + "step": 1050 + }, + { + "entropy": 0.5875487339496612, + "epoch": 2.857142857142857, + "grad_norm": 0.37569308280944824, + "learning_rate": 9.866453950646624e-05, + "loss": 0.537100830078125, + "mean_token_accuracy": 0.8407874500751495, + "num_tokens": 2810854.0, + "step": 1100 + }, + { + "entropy": 0.6040622130036354, + "epoch": 2.987012987012987, + "grad_norm": 0.4184563457965851, + "learning_rate": 9.711955615257278e-05, + "loss": 0.5466165924072266, + "mean_token_accuracy": 0.8411829793453216, + "num_tokens": 2932637.0, + "step": 1150 + }, + { + "epoch": 3.0, + "eval_entropy": 0.6296488522337034, + "eval_loss": 0.671941876411438, + "eval_mean_token_accuracy": 0.8121863231062889, + "eval_num_tokens": 2944650.0, + "eval_runtime": 47.9946, + "eval_samples_per_second": 34.525, + "eval_steps_per_second": 4.334, + "step": 1155 + }, + { + "entropy": 0.543001911342144, + "epoch": 3.116883116883117, + "grad_norm": 0.39842459559440613, + "learning_rate": 9.548779009744178e-05, + "loss": 0.4850382995605469, + "mean_token_accuracy": 0.8531740349531174, + "num_tokens": 3054046.0, + "step": 1200 + }, + { + "entropy": 0.5417884975671768, + "epoch": 3.2467532467532467, + "grad_norm": 0.529087245464325, + "learning_rate": 9.37725942116738e-05, + "loss": 0.48398651123046876, + "mean_token_accuracy": 0.8532969230413436, + "num_tokens": 3182515.0, + "step": 1250 + }, + { + "entropy": 0.5348580208420753, + "epoch": 3.3766233766233764, + "grad_norm": 0.4483303129673004, + "learning_rate": 9.197749279327802e-05, + "loss": 0.4842509078979492, + "mean_token_accuracy": 0.8533288407325744, + "num_tokens": 3313672.0, + "step": 1300 + }, + { + "entropy": 0.5422027057409287, + "epoch": 3.5064935064935066, + "grad_norm": 0.448234885931015, + "learning_rate": 9.010617432612243e-05, + "loss": 0.48615737915039064, + "mean_token_accuracy": 0.8517620968818664, + "num_tokens": 3440689.0, + "step": 1350 + }, + { + "entropy": 0.5518654137849808, + "epoch": 3.6363636363636362, + "grad_norm": 0.5707411170005798, + "learning_rate": 8.816248390102322e-05, + "loss": 0.4946014404296875, + "mean_token_accuracy": 0.8510974669456481, + "num_tokens": 3566427.0, + "step": 1400 + }, + { + "entropy": 0.5268254142999649, + "epoch": 3.7662337662337664, + "grad_norm": 0.42708849906921387, + "learning_rate": 8.615041531504609e-05, + "loss": 0.474882926940918, + "mean_token_accuracy": 0.8555294382572174, + "num_tokens": 3699976.0, + "step": 1450 + }, + { + "entropy": 0.5430487287044525, + "epoch": 3.896103896103896, + "grad_norm": 0.5479393005371094, + "learning_rate": 8.407410286525337e-05, + "loss": 0.48806171417236327, + "mean_token_accuracy": 0.8523763221502304, + "num_tokens": 3827926.0, + "step": 1500 + }, + { + "epoch": 4.0, + "eval_entropy": 0.6002918778417202, + "eval_loss": 0.6801126003265381, + "eval_mean_token_accuracy": 0.8130095520844827, + "eval_num_tokens": 3926200.0, + "eval_runtime": 47.9844, + "eval_samples_per_second": 34.532, + "eval_steps_per_second": 4.335, + "step": 1540 + }, + { + "entropy": 0.5307527387142181, + "epoch": 4.025974025974026, + "grad_norm": 0.5444459915161133, + "learning_rate": 8.193781285375899e-05, + "loss": 0.4741718292236328, + "mean_token_accuracy": 0.8564361107349395, + "num_tokens": 3953109.0, + "step": 1550 + }, + { + "entropy": 0.47564762085676193, + "epoch": 4.1558441558441555, + "grad_norm": 0.6210038065910339, + "learning_rate": 7.974593482154601e-05, + "loss": 0.41448020935058594, + "mean_token_accuracy": 0.8706729990243912, + "num_tokens": 4081079.0, + "step": 1600 + }, + { + "entropy": 0.4731892004609108, + "epoch": 4.285714285714286, + "grad_norm": 0.4910389482975006, + "learning_rate": 7.750297252905916e-05, + "loss": 0.4135689163208008, + "mean_token_accuracy": 0.8708927237987518, + "num_tokens": 4206426.0, + "step": 1650 + }, + { + "entropy": 0.47941224902868274, + "epoch": 4.415584415584416, + "grad_norm": 0.4900703430175781, + "learning_rate": 7.521353470210501e-05, + "loss": 0.4219230270385742, + "mean_token_accuracy": 0.8685608941316605, + "num_tokens": 4329839.0, + "step": 1700 + }, + { + "entropy": 0.48145264506340024, + "epoch": 4.545454545454545, + "grad_norm": 0.4503515064716339, + "learning_rate": 7.288232556207461e-05, + "loss": 0.4248290252685547, + "mean_token_accuracy": 0.8680109107494354, + "num_tokens": 4453940.0, + "step": 1750 + }, + { + "entropy": 0.47785325974226, + "epoch": 4.675324675324675, + "grad_norm": 0.5630834698677063, + "learning_rate": 7.051413515994661e-05, + "loss": 0.4244534683227539, + "mean_token_accuracy": 0.8692984575033188, + "num_tokens": 4583626.0, + "step": 1800 + }, + { + "entropy": 0.4738149631023407, + "epoch": 4.805194805194805, + "grad_norm": 0.5809922814369202, + "learning_rate": 6.811382953393207e-05, + "loss": 0.41768589019775393, + "mean_token_accuracy": 0.870254020690918, + "num_tokens": 4713540.0, + "step": 1850 + }, + { + "entropy": 0.47820780247449873, + "epoch": 4.935064935064935, + "grad_norm": 0.5658873915672302, + "learning_rate": 6.56863407109845e-05, + "loss": 0.42182437896728514, + "mean_token_accuracy": 0.8680490332841874, + "num_tokens": 4845678.0, + "step": 1900 + }, + { + "epoch": 5.0, + "eval_entropy": 0.5540635969776374, + "eval_loss": 0.6924836039543152, + "eval_mean_token_accuracy": 0.8140181821699326, + "eval_num_tokens": 4907750.0, + "eval_runtime": 47.9926, + "eval_samples_per_second": 34.526, + "eval_steps_per_second": 4.334, + "step": 1925 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.723982041528156e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..34b529d53848971eda3155b95c185289489ac0b8 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.04641649878824187, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj", + "gate_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..905899b993df934967865bd125d25b1fa1402b2e --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/trainer_state.json @@ -0,0 +1,560 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.0, + "eval_steps": 500, + "global_step": 2310, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.0450534927845, + "epoch": 0.12987012987012986, + "grad_norm": 0.9224470853805542, + "learning_rate": 1.3970570546126444e-05, + "loss": 1.944740753173828, + "mean_token_accuracy": 0.6158743992447853, + "num_tokens": 133878.0, + "step": 50 + }, + { + "entropy": 1.1079417771101, + "epoch": 0.2597402597402597, + "grad_norm": 0.6827074289321899, + "learning_rate": 2.822625477686771e-05, + "loss": 1.0624147033691407, + "mean_token_accuracy": 0.7396554726362229, + "num_tokens": 265533.0, + "step": 100 + }, + { + "entropy": 0.8689985585212707, + "epoch": 0.38961038961038963, + "grad_norm": 0.6320164203643799, + "learning_rate": 4.248193900760899e-05, + "loss": 0.8256858825683594, + "mean_token_accuracy": 0.7825630265474319, + "num_tokens": 386480.0, + "step": 150 + }, + { + "entropy": 0.7946180325746536, + "epoch": 0.5194805194805194, + "grad_norm": 0.7246975302696228, + "learning_rate": 5.6737623238350247e-05, + "loss": 0.7475, + "mean_token_accuracy": 0.7959125518798829, + "num_tokens": 520135.0, + "step": 200 + }, + { + "entropy": 0.7919283157587051, + "epoch": 0.6493506493506493, + "grad_norm": 0.7635198831558228, + "learning_rate": 7.099330746909153e-05, + "loss": 0.737699966430664, + "mean_token_accuracy": 0.798919832110405, + "num_tokens": 640303.0, + "step": 250 + }, + { + "entropy": 0.7713775283098221, + "epoch": 0.7792207792207793, + "grad_norm": 0.5751814842224121, + "learning_rate": 8.52489916998328e-05, + "loss": 0.7186790466308594, + "mean_token_accuracy": 0.8014196193218232, + "num_tokens": 766027.0, + "step": 300 + }, + { + "entropy": 0.737273331284523, + "epoch": 0.9090909090909091, + "grad_norm": 0.4956009089946747, + "learning_rate": 9.950467593057406e-05, + "loss": 0.6904002380371094, + "mean_token_accuracy": 0.8083628410100937, + "num_tokens": 894871.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.7681069333965962, + "eval_loss": 0.7430074214935303, + "eval_mean_token_accuracy": 0.7941452006881053, + "eval_num_tokens": 981550.0, + "eval_runtime": 47.9814, + "eval_samples_per_second": 34.534, + "eval_steps_per_second": 4.335, + "step": 385 + }, + { + "entropy": 0.7167584246397019, + "epoch": 1.0389610389610389, + "grad_norm": 0.638637900352478, + "learning_rate": 0.00010976434715123926, + "loss": 0.6611510467529297, + "mean_token_accuracy": 0.8145374125242233, + "num_tokens": 1021041.0, + "step": 400 + }, + { + "entropy": 0.6893188625574111, + "epoch": 1.1688311688311688, + "grad_norm": 0.554688036441803, + "learning_rate": 0.00010967639449071182, + "loss": 0.6347718811035157, + "mean_token_accuracy": 0.8226255452632905, + "num_tokens": 1149161.0, + "step": 450 + }, + { + "entropy": 0.6728485250473022, + "epoch": 1.2987012987012987, + "grad_norm": 0.5104691386222839, + "learning_rate": 0.00010947585797076785, + "loss": 0.6233362579345703, + "mean_token_accuracy": 0.8227335858345032, + "num_tokens": 1278079.0, + "step": 500 + }, + { + "entropy": 0.6812490409612656, + "epoch": 1.4285714285714286, + "grad_norm": 0.4700304865837097, + "learning_rate": 0.00010916314964373551, + "loss": 0.6289351654052734, + "mean_token_accuracy": 0.8217820060253144, + "num_tokens": 1405326.0, + "step": 550 + }, + { + "entropy": 0.6633523851633072, + "epoch": 1.5584415584415585, + "grad_norm": 0.5094137191772461, + "learning_rate": 0.0001087389120469154, + "loss": 0.6168266296386719, + "mean_token_accuracy": 0.8232935756444931, + "num_tokens": 1532507.0, + "step": 600 + }, + { + "entropy": 0.6454597359895706, + "epoch": 1.6883116883116882, + "grad_norm": 0.40546780824661255, + "learning_rate": 0.00010820401688232725, + "loss": 0.5995024108886718, + "mean_token_accuracy": 0.8284876370429992, + "num_tokens": 1662747.0, + "step": 650 + }, + { + "entropy": 0.6700882267951965, + "epoch": 1.8181818181818183, + "grad_norm": 0.35203394293785095, + "learning_rate": 0.00010755956322558065, + "loss": 0.616350212097168, + "mean_token_accuracy": 0.8246171402931214, + "num_tokens": 1783817.0, + "step": 700 + }, + { + "entropy": 0.6587968200445176, + "epoch": 1.948051948051948, + "grad_norm": 0.5248188376426697, + "learning_rate": 0.00010680687526754984, + "loss": 0.608861198425293, + "mean_token_accuracy": 0.8265628081560135, + "num_tokens": 1914573.0, + "step": 750 + }, + { + "epoch": 2.0, + "eval_entropy": 0.7101548368541094, + "eval_loss": 0.6869887113571167, + "eval_mean_token_accuracy": 0.8058141409777678, + "eval_num_tokens": 1963100.0, + "eval_runtime": 48.0161, + "eval_samples_per_second": 34.509, + "eval_steps_per_second": 4.332, + "step": 770 + }, + { + "entropy": 0.6272834652662277, + "epoch": 2.0779220779220777, + "grad_norm": 0.5319918990135193, + "learning_rate": 0.00010594749959349313, + "loss": 0.5719264221191406, + "mean_token_accuracy": 0.8345229256153107, + "num_tokens": 2035968.0, + "step": 800 + }, + { + "entropy": 0.599158108830452, + "epoch": 2.207792207792208, + "grad_norm": 0.4296111762523651, + "learning_rate": 0.00010498320200520744, + "loss": 0.5460208129882812, + "mean_token_accuracy": 0.8396762716770172, + "num_tokens": 2168591.0, + "step": 850 + }, + { + "entropy": 0.5994478952884674, + "epoch": 2.3376623376623376, + "grad_norm": 0.3820176422595978, + "learning_rate": 0.00010391596389274791, + "loss": 0.5483282852172852, + "mean_token_accuracy": 0.8385387778282165, + "num_tokens": 2297967.0, + "step": 900 + }, + { + "entropy": 0.5989043036103249, + "epoch": 2.4675324675324677, + "grad_norm": 0.41718003153800964, + "learning_rate": 0.00010274797816316749, + "loss": 0.543673095703125, + "mean_token_accuracy": 0.8396250855922699, + "num_tokens": 2425614.0, + "step": 950 + }, + { + "entropy": 0.6005555561184883, + "epoch": 2.5974025974025974, + "grad_norm": 0.4552708864212036, + "learning_rate": 0.00010148164473464206, + "loss": 0.5505282974243164, + "mean_token_accuracy": 0.8371219438314438, + "num_tokens": 2557421.0, + "step": 1000 + }, + { + "entropy": 0.5972397547960281, + "epoch": 2.7272727272727275, + "grad_norm": 0.45796939730644226, + "learning_rate": 0.00010011956560523972, + "loss": 0.5410661697387695, + "mean_token_accuracy": 0.8398081564903259, + "num_tokens": 2680379.0, + "step": 1050 + }, + { + "entropy": 0.5875487339496612, + "epoch": 2.857142857142857, + "grad_norm": 0.37569308280944824, + "learning_rate": 9.866453950646624e-05, + "loss": 0.537100830078125, + "mean_token_accuracy": 0.8407874500751495, + "num_tokens": 2810854.0, + "step": 1100 + }, + { + "entropy": 0.6040622130036354, + "epoch": 2.987012987012987, + "grad_norm": 0.4184563457965851, + "learning_rate": 9.711955615257278e-05, + "loss": 0.5466165924072266, + "mean_token_accuracy": 0.8411829793453216, + "num_tokens": 2932637.0, + "step": 1150 + }, + { + "epoch": 3.0, + "eval_entropy": 0.6296488522337034, + "eval_loss": 0.671941876411438, + "eval_mean_token_accuracy": 0.8121863231062889, + "eval_num_tokens": 2944650.0, + "eval_runtime": 47.9946, + "eval_samples_per_second": 34.525, + "eval_steps_per_second": 4.334, + "step": 1155 + }, + { + "entropy": 0.543001911342144, + "epoch": 3.116883116883117, + "grad_norm": 0.39842459559440613, + "learning_rate": 9.548779009744178e-05, + "loss": 0.4850382995605469, + "mean_token_accuracy": 0.8531740349531174, + "num_tokens": 3054046.0, + "step": 1200 + }, + { + "entropy": 0.5417884975671768, + "epoch": 3.2467532467532467, + "grad_norm": 0.529087245464325, + "learning_rate": 9.37725942116738e-05, + "loss": 0.48398651123046876, + "mean_token_accuracy": 0.8532969230413436, + "num_tokens": 3182515.0, + "step": 1250 + }, + { + "entropy": 0.5348580208420753, + "epoch": 3.3766233766233764, + "grad_norm": 0.4483303129673004, + "learning_rate": 9.197749279327802e-05, + "loss": 0.4842509078979492, + "mean_token_accuracy": 0.8533288407325744, + "num_tokens": 3313672.0, + "step": 1300 + }, + { + "entropy": 0.5422027057409287, + "epoch": 3.5064935064935066, + "grad_norm": 0.448234885931015, + "learning_rate": 9.010617432612243e-05, + "loss": 0.48615737915039064, + "mean_token_accuracy": 0.8517620968818664, + "num_tokens": 3440689.0, + "step": 1350 + }, + { + "entropy": 0.5518654137849808, + "epoch": 3.6363636363636362, + "grad_norm": 0.5707411170005798, + "learning_rate": 8.816248390102322e-05, + "loss": 0.4946014404296875, + "mean_token_accuracy": 0.8510974669456481, + "num_tokens": 3566427.0, + "step": 1400 + }, + { + "entropy": 0.5268254142999649, + "epoch": 3.7662337662337664, + "grad_norm": 0.42708849906921387, + "learning_rate": 8.615041531504609e-05, + "loss": 0.474882926940918, + "mean_token_accuracy": 0.8555294382572174, + "num_tokens": 3699976.0, + "step": 1450 + }, + { + "entropy": 0.5430487287044525, + "epoch": 3.896103896103896, + "grad_norm": 0.5479393005371094, + "learning_rate": 8.407410286525337e-05, + "loss": 0.48806171417236327, + "mean_token_accuracy": 0.8523763221502304, + "num_tokens": 3827926.0, + "step": 1500 + }, + { + "epoch": 4.0, + "eval_entropy": 0.6002918778417202, + "eval_loss": 0.6801126003265381, + "eval_mean_token_accuracy": 0.8130095520844827, + "eval_num_tokens": 3926200.0, + "eval_runtime": 47.9844, + "eval_samples_per_second": 34.532, + "eval_steps_per_second": 4.335, + "step": 1540 + }, + { + "entropy": 0.5307527387142181, + "epoch": 4.025974025974026, + "grad_norm": 0.5444459915161133, + "learning_rate": 8.193781285375899e-05, + "loss": 0.4741718292236328, + "mean_token_accuracy": 0.8564361107349395, + "num_tokens": 3953109.0, + "step": 1550 + }, + { + "entropy": 0.47564762085676193, + "epoch": 4.1558441558441555, + "grad_norm": 0.6210038065910339, + "learning_rate": 7.974593482154601e-05, + "loss": 0.41448020935058594, + "mean_token_accuracy": 0.8706729990243912, + "num_tokens": 4081079.0, + "step": 1600 + }, + { + "entropy": 0.4731892004609108, + "epoch": 4.285714285714286, + "grad_norm": 0.4910389482975006, + "learning_rate": 7.750297252905916e-05, + "loss": 0.4135689163208008, + "mean_token_accuracy": 0.8708927237987518, + "num_tokens": 4206426.0, + "step": 1650 + }, + { + "entropy": 0.47941224902868274, + "epoch": 4.415584415584416, + "grad_norm": 0.4900703430175781, + "learning_rate": 7.521353470210501e-05, + "loss": 0.4219230270385742, + "mean_token_accuracy": 0.8685608941316605, + "num_tokens": 4329839.0, + "step": 1700 + }, + { + "entropy": 0.48145264506340024, + "epoch": 4.545454545454545, + "grad_norm": 0.4503515064716339, + "learning_rate": 7.288232556207461e-05, + "loss": 0.4248290252685547, + "mean_token_accuracy": 0.8680109107494354, + "num_tokens": 4453940.0, + "step": 1750 + }, + { + "entropy": 0.47785325974226, + "epoch": 4.675324675324675, + "grad_norm": 0.5630834698677063, + "learning_rate": 7.051413515994661e-05, + "loss": 0.4244534683227539, + "mean_token_accuracy": 0.8692984575033188, + "num_tokens": 4583626.0, + "step": 1800 + }, + { + "entropy": 0.4738149631023407, + "epoch": 4.805194805194805, + "grad_norm": 0.5809922814369202, + "learning_rate": 6.811382953393207e-05, + "loss": 0.41768589019775393, + "mean_token_accuracy": 0.870254020690918, + "num_tokens": 4713540.0, + "step": 1850 + }, + { + "entropy": 0.47820780247449873, + "epoch": 4.935064935064935, + "grad_norm": 0.5658873915672302, + "learning_rate": 6.56863407109845e-05, + "loss": 0.42182437896728514, + "mean_token_accuracy": 0.8680490332841874, + "num_tokens": 4845678.0, + "step": 1900 + }, + { + "epoch": 5.0, + "eval_entropy": 0.5540635969776374, + "eval_loss": 0.6924836039543152, + "eval_mean_token_accuracy": 0.8140181821699326, + "eval_num_tokens": 4907750.0, + "eval_runtime": 47.9926, + "eval_samples_per_second": 34.526, + "eval_steps_per_second": 4.334, + "step": 1925 + }, + { + "entropy": 0.4371735429763794, + "epoch": 5.064935064935065, + "grad_norm": 0.5684086680412292, + "learning_rate": 6.323665657271966e-05, + "loss": 0.3749085998535156, + "mean_token_accuracy": 0.8815305006504058, + "num_tokens": 4975955.0, + "step": 1950 + }, + { + "entropy": 0.40612608641386033, + "epoch": 5.194805194805195, + "grad_norm": 0.6515666842460632, + "learning_rate": 6.076981060656787e-05, + "loss": 0.33952392578125, + "mean_token_accuracy": 0.8902835595607758, + "num_tokens": 5104770.0, + "step": 2000 + }, + { + "entropy": 0.41485957682132724, + "epoch": 5.324675324675325, + "grad_norm": 0.5700598359107971, + "learning_rate": 5.829087156321799e-05, + "loss": 0.345616455078125, + "mean_token_accuracy": 0.8897144883871079, + "num_tokens": 5229052.0, + "step": 2050 + }, + { + "entropy": 0.4269444864988327, + "epoch": 5.454545454545454, + "grad_norm": 0.6670826077461243, + "learning_rate": 5.580493304160404e-05, + "loss": 0.35833843231201173, + "mean_token_accuracy": 0.8866234600543976, + "num_tokens": 5357753.0, + "step": 2100 + }, + { + "entropy": 0.41144982814788816, + "epoch": 5.584415584415584, + "grad_norm": 0.620884120464325, + "learning_rate": 5.331710302283492e-05, + "loss": 0.3445538330078125, + "mean_token_accuracy": 0.8895936322212219, + "num_tokens": 5481550.0, + "step": 2150 + }, + { + "entropy": 0.41389220267534255, + "epoch": 5.714285714285714, + "grad_norm": 0.6487388610839844, + "learning_rate": 5.0832493374572605e-05, + "loss": 0.34858001708984376, + "mean_token_accuracy": 0.8874113804101944, + "num_tokens": 5609104.0, + "step": 2200 + }, + { + "entropy": 0.41483602195978164, + "epoch": 5.8441558441558445, + "grad_norm": 0.5946773290634155, + "learning_rate": 4.835620934742408e-05, + "loss": 0.3495229721069336, + "mean_token_accuracy": 0.8887655180692673, + "num_tokens": 5737347.0, + "step": 2250 + }, + { + "entropy": 0.4204313641786575, + "epoch": 5.974025974025974, + "grad_norm": 0.6658430099487305, + "learning_rate": 4.589333908492996e-05, + "loss": 0.3538378143310547, + "mean_token_accuracy": 0.8866806083917618, + "num_tokens": 5861872.0, + "step": 2300 + }, + { + "epoch": 6.0, + "eval_entropy": 0.4912281467650945, + "eval_loss": 0.7537463903427124, + "eval_mean_token_accuracy": 0.8104288898981534, + "eval_num_tokens": 5889300.0, + "eval_runtime": 48.0083, + "eval_samples_per_second": 34.515, + "eval_steps_per_second": 4.333, + "step": 2310 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.26902097163009e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..34b529d53848971eda3155b95c185289489ac0b8 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.04641649878824187, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj", + "gate_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ce9b160588db7f10e1f9cc8177b12783be16ecc1 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/trainer_state.json @@ -0,0 +1,641 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.0, + "eval_steps": 500, + "global_step": 2695, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.0450534927845, + "epoch": 0.12987012987012986, + "grad_norm": 0.9224470853805542, + "learning_rate": 1.3970570546126444e-05, + "loss": 1.944740753173828, + "mean_token_accuracy": 0.6158743992447853, + "num_tokens": 133878.0, + "step": 50 + }, + { + "entropy": 1.1079417771101, + "epoch": 0.2597402597402597, + "grad_norm": 0.6827074289321899, + "learning_rate": 2.822625477686771e-05, + "loss": 1.0624147033691407, + "mean_token_accuracy": 0.7396554726362229, + "num_tokens": 265533.0, + "step": 100 + }, + { + "entropy": 0.8689985585212707, + "epoch": 0.38961038961038963, + "grad_norm": 0.6320164203643799, + "learning_rate": 4.248193900760899e-05, + "loss": 0.8256858825683594, + "mean_token_accuracy": 0.7825630265474319, + "num_tokens": 386480.0, + "step": 150 + }, + { + "entropy": 0.7946180325746536, + "epoch": 0.5194805194805194, + "grad_norm": 0.7246975302696228, + "learning_rate": 5.6737623238350247e-05, + "loss": 0.7475, + "mean_token_accuracy": 0.7959125518798829, + "num_tokens": 520135.0, + "step": 200 + }, + { + "entropy": 0.7919283157587051, + "epoch": 0.6493506493506493, + "grad_norm": 0.7635198831558228, + "learning_rate": 7.099330746909153e-05, + "loss": 0.737699966430664, + "mean_token_accuracy": 0.798919832110405, + "num_tokens": 640303.0, + "step": 250 + }, + { + "entropy": 0.7713775283098221, + "epoch": 0.7792207792207793, + "grad_norm": 0.5751814842224121, + "learning_rate": 8.52489916998328e-05, + "loss": 0.7186790466308594, + "mean_token_accuracy": 0.8014196193218232, + "num_tokens": 766027.0, + "step": 300 + }, + { + "entropy": 0.737273331284523, + "epoch": 0.9090909090909091, + "grad_norm": 0.4956009089946747, + "learning_rate": 9.950467593057406e-05, + "loss": 0.6904002380371094, + "mean_token_accuracy": 0.8083628410100937, + "num_tokens": 894871.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.7681069333965962, + "eval_loss": 0.7430074214935303, + "eval_mean_token_accuracy": 0.7941452006881053, + "eval_num_tokens": 981550.0, + "eval_runtime": 47.9814, + "eval_samples_per_second": 34.534, + "eval_steps_per_second": 4.335, + "step": 385 + }, + { + "entropy": 0.7167584246397019, + "epoch": 1.0389610389610389, + "grad_norm": 0.638637900352478, + "learning_rate": 0.00010976434715123926, + "loss": 0.6611510467529297, + "mean_token_accuracy": 0.8145374125242233, + "num_tokens": 1021041.0, + "step": 400 + }, + { + "entropy": 0.6893188625574111, + "epoch": 1.1688311688311688, + "grad_norm": 0.554688036441803, + "learning_rate": 0.00010967639449071182, + "loss": 0.6347718811035157, + "mean_token_accuracy": 0.8226255452632905, + "num_tokens": 1149161.0, + "step": 450 + }, + { + "entropy": 0.6728485250473022, + "epoch": 1.2987012987012987, + "grad_norm": 0.5104691386222839, + "learning_rate": 0.00010947585797076785, + "loss": 0.6233362579345703, + "mean_token_accuracy": 0.8227335858345032, + "num_tokens": 1278079.0, + "step": 500 + }, + { + "entropy": 0.6812490409612656, + "epoch": 1.4285714285714286, + "grad_norm": 0.4700304865837097, + "learning_rate": 0.00010916314964373551, + "loss": 0.6289351654052734, + "mean_token_accuracy": 0.8217820060253144, + "num_tokens": 1405326.0, + "step": 550 + }, + { + "entropy": 0.6633523851633072, + "epoch": 1.5584415584415585, + "grad_norm": 0.5094137191772461, + "learning_rate": 0.0001087389120469154, + "loss": 0.6168266296386719, + "mean_token_accuracy": 0.8232935756444931, + "num_tokens": 1532507.0, + "step": 600 + }, + { + "entropy": 0.6454597359895706, + "epoch": 1.6883116883116882, + "grad_norm": 0.40546780824661255, + "learning_rate": 0.00010820401688232725, + "loss": 0.5995024108886718, + "mean_token_accuracy": 0.8284876370429992, + "num_tokens": 1662747.0, + "step": 650 + }, + { + "entropy": 0.6700882267951965, + "epoch": 1.8181818181818183, + "grad_norm": 0.35203394293785095, + "learning_rate": 0.00010755956322558065, + "loss": 0.616350212097168, + "mean_token_accuracy": 0.8246171402931214, + "num_tokens": 1783817.0, + "step": 700 + }, + { + "entropy": 0.6587968200445176, + "epoch": 1.948051948051948, + "grad_norm": 0.5248188376426697, + "learning_rate": 0.00010680687526754984, + "loss": 0.608861198425293, + "mean_token_accuracy": 0.8265628081560135, + "num_tokens": 1914573.0, + "step": 750 + }, + { + "epoch": 2.0, + "eval_entropy": 0.7101548368541094, + "eval_loss": 0.6869887113571167, + "eval_mean_token_accuracy": 0.8058141409777678, + "eval_num_tokens": 1963100.0, + "eval_runtime": 48.0161, + "eval_samples_per_second": 34.509, + "eval_steps_per_second": 4.332, + "step": 770 + }, + { + "entropy": 0.6272834652662277, + "epoch": 2.0779220779220777, + "grad_norm": 0.5319918990135193, + "learning_rate": 0.00010594749959349313, + "loss": 0.5719264221191406, + "mean_token_accuracy": 0.8345229256153107, + "num_tokens": 2035968.0, + "step": 800 + }, + { + "entropy": 0.599158108830452, + "epoch": 2.207792207792208, + "grad_norm": 0.4296111762523651, + "learning_rate": 0.00010498320200520744, + "loss": 0.5460208129882812, + "mean_token_accuracy": 0.8396762716770172, + "num_tokens": 2168591.0, + "step": 850 + }, + { + "entropy": 0.5994478952884674, + "epoch": 2.3376623376623376, + "grad_norm": 0.3820176422595978, + "learning_rate": 0.00010391596389274791, + "loss": 0.5483282852172852, + "mean_token_accuracy": 0.8385387778282165, + "num_tokens": 2297967.0, + "step": 900 + }, + { + "entropy": 0.5989043036103249, + "epoch": 2.4675324675324677, + "grad_norm": 0.41718003153800964, + "learning_rate": 0.00010274797816316749, + "loss": 0.543673095703125, + "mean_token_accuracy": 0.8396250855922699, + "num_tokens": 2425614.0, + "step": 950 + }, + { + "entropy": 0.6005555561184883, + "epoch": 2.5974025974025974, + "grad_norm": 0.4552708864212036, + "learning_rate": 0.00010148164473464206, + "loss": 0.5505282974243164, + "mean_token_accuracy": 0.8371219438314438, + "num_tokens": 2557421.0, + "step": 1000 + }, + { + "entropy": 0.5972397547960281, + "epoch": 2.7272727272727275, + "grad_norm": 0.45796939730644226, + "learning_rate": 0.00010011956560523972, + "loss": 0.5410661697387695, + "mean_token_accuracy": 0.8398081564903259, + "num_tokens": 2680379.0, + "step": 1050 + }, + { + "entropy": 0.5875487339496612, + "epoch": 2.857142857142857, + "grad_norm": 0.37569308280944824, + "learning_rate": 9.866453950646624e-05, + "loss": 0.537100830078125, + "mean_token_accuracy": 0.8407874500751495, + "num_tokens": 2810854.0, + "step": 1100 + }, + { + "entropy": 0.6040622130036354, + "epoch": 2.987012987012987, + "grad_norm": 0.4184563457965851, + "learning_rate": 9.711955615257278e-05, + "loss": 0.5466165924072266, + "mean_token_accuracy": 0.8411829793453216, + "num_tokens": 2932637.0, + "step": 1150 + }, + { + "epoch": 3.0, + "eval_entropy": 0.6296488522337034, + "eval_loss": 0.671941876411438, + "eval_mean_token_accuracy": 0.8121863231062889, + "eval_num_tokens": 2944650.0, + "eval_runtime": 47.9946, + "eval_samples_per_second": 34.525, + "eval_steps_per_second": 4.334, + "step": 1155 + }, + { + "entropy": 0.543001911342144, + "epoch": 3.116883116883117, + "grad_norm": 0.39842459559440613, + "learning_rate": 9.548779009744178e-05, + "loss": 0.4850382995605469, + "mean_token_accuracy": 0.8531740349531174, + "num_tokens": 3054046.0, + "step": 1200 + }, + { + "entropy": 0.5417884975671768, + "epoch": 3.2467532467532467, + "grad_norm": 0.529087245464325, + "learning_rate": 9.37725942116738e-05, + "loss": 0.48398651123046876, + "mean_token_accuracy": 0.8532969230413436, + "num_tokens": 3182515.0, + "step": 1250 + }, + { + "entropy": 0.5348580208420753, + "epoch": 3.3766233766233764, + "grad_norm": 0.4483303129673004, + "learning_rate": 9.197749279327802e-05, + "loss": 0.4842509078979492, + "mean_token_accuracy": 0.8533288407325744, + "num_tokens": 3313672.0, + "step": 1300 + }, + { + "entropy": 0.5422027057409287, + "epoch": 3.5064935064935066, + "grad_norm": 0.448234885931015, + "learning_rate": 9.010617432612243e-05, + "loss": 0.48615737915039064, + "mean_token_accuracy": 0.8517620968818664, + "num_tokens": 3440689.0, + "step": 1350 + }, + { + "entropy": 0.5518654137849808, + "epoch": 3.6363636363636362, + "grad_norm": 0.5707411170005798, + "learning_rate": 8.816248390102322e-05, + "loss": 0.4946014404296875, + "mean_token_accuracy": 0.8510974669456481, + "num_tokens": 3566427.0, + "step": 1400 + }, + { + "entropy": 0.5268254142999649, + "epoch": 3.7662337662337664, + "grad_norm": 0.42708849906921387, + "learning_rate": 8.615041531504609e-05, + "loss": 0.474882926940918, + "mean_token_accuracy": 0.8555294382572174, + "num_tokens": 3699976.0, + "step": 1450 + }, + { + "entropy": 0.5430487287044525, + "epoch": 3.896103896103896, + "grad_norm": 0.5479393005371094, + "learning_rate": 8.407410286525337e-05, + "loss": 0.48806171417236327, + "mean_token_accuracy": 0.8523763221502304, + "num_tokens": 3827926.0, + "step": 1500 + }, + { + "epoch": 4.0, + "eval_entropy": 0.6002918778417202, + "eval_loss": 0.6801126003265381, + "eval_mean_token_accuracy": 0.8130095520844827, + "eval_num_tokens": 3926200.0, + "eval_runtime": 47.9844, + "eval_samples_per_second": 34.532, + "eval_steps_per_second": 4.335, + "step": 1540 + }, + { + "entropy": 0.5307527387142181, + "epoch": 4.025974025974026, + "grad_norm": 0.5444459915161133, + "learning_rate": 8.193781285375899e-05, + "loss": 0.4741718292236328, + "mean_token_accuracy": 0.8564361107349395, + "num_tokens": 3953109.0, + "step": 1550 + }, + { + "entropy": 0.47564762085676193, + "epoch": 4.1558441558441555, + "grad_norm": 0.6210038065910339, + "learning_rate": 7.974593482154601e-05, + "loss": 0.41448020935058594, + "mean_token_accuracy": 0.8706729990243912, + "num_tokens": 4081079.0, + "step": 1600 + }, + { + "entropy": 0.4731892004609108, + "epoch": 4.285714285714286, + "grad_norm": 0.4910389482975006, + "learning_rate": 7.750297252905916e-05, + "loss": 0.4135689163208008, + "mean_token_accuracy": 0.8708927237987518, + "num_tokens": 4206426.0, + "step": 1650 + }, + { + "entropy": 0.47941224902868274, + "epoch": 4.415584415584416, + "grad_norm": 0.4900703430175781, + "learning_rate": 7.521353470210501e-05, + "loss": 0.4219230270385742, + "mean_token_accuracy": 0.8685608941316605, + "num_tokens": 4329839.0, + "step": 1700 + }, + { + "entropy": 0.48145264506340024, + "epoch": 4.545454545454545, + "grad_norm": 0.4503515064716339, + "learning_rate": 7.288232556207461e-05, + "loss": 0.4248290252685547, + "mean_token_accuracy": 0.8680109107494354, + "num_tokens": 4453940.0, + "step": 1750 + }, + { + "entropy": 0.47785325974226, + "epoch": 4.675324675324675, + "grad_norm": 0.5630834698677063, + "learning_rate": 7.051413515994661e-05, + "loss": 0.4244534683227539, + "mean_token_accuracy": 0.8692984575033188, + "num_tokens": 4583626.0, + "step": 1800 + }, + { + "entropy": 0.4738149631023407, + "epoch": 4.805194805194805, + "grad_norm": 0.5809922814369202, + "learning_rate": 6.811382953393207e-05, + "loss": 0.41768589019775393, + "mean_token_accuracy": 0.870254020690918, + "num_tokens": 4713540.0, + "step": 1850 + }, + { + "entropy": 0.47820780247449873, + "epoch": 4.935064935064935, + "grad_norm": 0.5658873915672302, + "learning_rate": 6.56863407109845e-05, + "loss": 0.42182437896728514, + "mean_token_accuracy": 0.8680490332841874, + "num_tokens": 4845678.0, + "step": 1900 + }, + { + "epoch": 5.0, + "eval_entropy": 0.5540635969776374, + "eval_loss": 0.6924836039543152, + "eval_mean_token_accuracy": 0.8140181821699326, + "eval_num_tokens": 4907750.0, + "eval_runtime": 47.9926, + "eval_samples_per_second": 34.526, + "eval_steps_per_second": 4.334, + "step": 1925 + }, + { + "entropy": 0.4371735429763794, + "epoch": 5.064935064935065, + "grad_norm": 0.5684086680412292, + "learning_rate": 6.323665657271966e-05, + "loss": 0.3749085998535156, + "mean_token_accuracy": 0.8815305006504058, + "num_tokens": 4975955.0, + "step": 1950 + }, + { + "entropy": 0.40612608641386033, + "epoch": 5.194805194805195, + "grad_norm": 0.6515666842460632, + "learning_rate": 6.076981060656787e-05, + "loss": 0.33952392578125, + "mean_token_accuracy": 0.8902835595607758, + "num_tokens": 5104770.0, + "step": 2000 + }, + { + "entropy": 0.41485957682132724, + "epoch": 5.324675324675325, + "grad_norm": 0.5700598359107971, + "learning_rate": 5.829087156321799e-05, + "loss": 0.345616455078125, + "mean_token_accuracy": 0.8897144883871079, + "num_tokens": 5229052.0, + "step": 2050 + }, + { + "entropy": 0.4269444864988327, + "epoch": 5.454545454545454, + "grad_norm": 0.6670826077461243, + "learning_rate": 5.580493304160404e-05, + "loss": 0.35833843231201173, + "mean_token_accuracy": 0.8866234600543976, + "num_tokens": 5357753.0, + "step": 2100 + }, + { + "entropy": 0.41144982814788816, + "epoch": 5.584415584415584, + "grad_norm": 0.620884120464325, + "learning_rate": 5.331710302283492e-05, + "loss": 0.3445538330078125, + "mean_token_accuracy": 0.8895936322212219, + "num_tokens": 5481550.0, + "step": 2150 + }, + { + "entropy": 0.41389220267534255, + "epoch": 5.714285714285714, + "grad_norm": 0.6487388610839844, + "learning_rate": 5.0832493374572605e-05, + "loss": 0.34858001708984376, + "mean_token_accuracy": 0.8874113804101944, + "num_tokens": 5609104.0, + "step": 2200 + }, + { + "entropy": 0.41483602195978164, + "epoch": 5.8441558441558445, + "grad_norm": 0.5946773290634155, + "learning_rate": 4.835620934742408e-05, + "loss": 0.3495229721069336, + "mean_token_accuracy": 0.8887655180692673, + "num_tokens": 5737347.0, + "step": 2250 + }, + { + "entropy": 0.4204313641786575, + "epoch": 5.974025974025974, + "grad_norm": 0.6658430099487305, + "learning_rate": 4.589333908492996e-05, + "loss": 0.3538378143310547, + "mean_token_accuracy": 0.8866806083917618, + "num_tokens": 5861872.0, + "step": 2300 + }, + { + "epoch": 6.0, + "eval_entropy": 0.4912281467650945, + "eval_loss": 0.7537463903427124, + "eval_mean_token_accuracy": 0.8104288898981534, + "eval_num_tokens": 5889300.0, + "eval_runtime": 48.0083, + "eval_samples_per_second": 34.515, + "eval_steps_per_second": 4.333, + "step": 2310 + }, + { + "entropy": 0.3666571286320686, + "epoch": 6.103896103896104, + "grad_norm": 0.6230902671813965, + "learning_rate": 4.344894316870371e-05, + "loss": 0.2813127517700195, + "mean_token_accuracy": 0.9077165073156357, + "num_tokens": 5990679.0, + "step": 2350 + }, + { + "entropy": 0.3391442158818245, + "epoch": 6.233766233766234, + "grad_norm": 0.629700779914856, + "learning_rate": 4.1028044220203685e-05, + "loss": 0.26457656860351564, + "mean_token_accuracy": 0.9139899307489395, + "num_tokens": 6117918.0, + "step": 2400 + }, + { + "entropy": 0.34152675241231917, + "epoch": 6.363636363636363, + "grad_norm": 0.711188793182373, + "learning_rate": 3.863561658050396e-05, + "loss": 0.26950265884399416, + "mean_token_accuracy": 0.9120226174592971, + "num_tokens": 6248566.0, + "step": 2450 + }, + { + "entropy": 0.33951902255415917, + "epoch": 6.4935064935064934, + "grad_norm": 0.7751043438911438, + "learning_rate": 3.627657608926905e-05, + "loss": 0.26502132415771484, + "mean_token_accuracy": 0.9131791013479232, + "num_tokens": 6380529.0, + "step": 2500 + }, + { + "entropy": 0.3548544436693192, + "epoch": 6.623376623376624, + "grad_norm": 0.9152925610542297, + "learning_rate": 3.395576998393457e-05, + "loss": 0.27833885192871094, + "mean_token_accuracy": 0.9090453034639359, + "num_tokens": 6501669.0, + "step": 2550 + }, + { + "entropy": 0.3550854653120041, + "epoch": 6.753246753246753, + "grad_norm": 0.8495270013809204, + "learning_rate": 3.167796693984804e-05, + "loss": 0.27818309783935546, + "mean_token_accuracy": 0.9102006632089615, + "num_tokens": 6624404.0, + "step": 2600 + }, + { + "entropy": 0.34031844735145567, + "epoch": 6.883116883116883, + "grad_norm": 0.6764019727706909, + "learning_rate": 2.9447847271835456e-05, + "loss": 0.26656494140625, + "mean_token_accuracy": 0.9126953399181366, + "num_tokens": 6755532.0, + "step": 2650 + }, + { + "epoch": 7.0, + "eval_entropy": 0.4410387526911039, + "eval_loss": 0.8089802265167236, + "eval_mean_token_accuracy": 0.80925900173875, + "eval_num_tokens": 6870850.0, + "eval_runtime": 47.9562, + "eval_samples_per_second": 34.552, + "eval_steps_per_second": 4.337, + "step": 2695 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.810735902601032e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..34b529d53848971eda3155b95c185289489ac0b8 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.04641649878824187, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj", + "gate_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f469d9d84585a86efbdc7dcb7d9ed68565c3eb78 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/trainer_state.json @@ -0,0 +1,732 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.0, + "eval_steps": 500, + "global_step": 3080, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.0450534927845, + "epoch": 0.12987012987012986, + "grad_norm": 0.9224470853805542, + "learning_rate": 1.3970570546126444e-05, + "loss": 1.944740753173828, + "mean_token_accuracy": 0.6158743992447853, + "num_tokens": 133878.0, + "step": 50 + }, + { + "entropy": 1.1079417771101, + "epoch": 0.2597402597402597, + "grad_norm": 0.6827074289321899, + "learning_rate": 2.822625477686771e-05, + "loss": 1.0624147033691407, + "mean_token_accuracy": 0.7396554726362229, + "num_tokens": 265533.0, + "step": 100 + }, + { + "entropy": 0.8689985585212707, + "epoch": 0.38961038961038963, + "grad_norm": 0.6320164203643799, + "learning_rate": 4.248193900760899e-05, + "loss": 0.8256858825683594, + "mean_token_accuracy": 0.7825630265474319, + "num_tokens": 386480.0, + "step": 150 + }, + { + "entropy": 0.7946180325746536, + "epoch": 0.5194805194805194, + "grad_norm": 0.7246975302696228, + "learning_rate": 5.6737623238350247e-05, + "loss": 0.7475, + "mean_token_accuracy": 0.7959125518798829, + "num_tokens": 520135.0, + "step": 200 + }, + { + "entropy": 0.7919283157587051, + "epoch": 0.6493506493506493, + "grad_norm": 0.7635198831558228, + "learning_rate": 7.099330746909153e-05, + "loss": 0.737699966430664, + "mean_token_accuracy": 0.798919832110405, + "num_tokens": 640303.0, + "step": 250 + }, + { + "entropy": 0.7713775283098221, + "epoch": 0.7792207792207793, + "grad_norm": 0.5751814842224121, + "learning_rate": 8.52489916998328e-05, + "loss": 0.7186790466308594, + "mean_token_accuracy": 0.8014196193218232, + "num_tokens": 766027.0, + "step": 300 + }, + { + "entropy": 0.737273331284523, + "epoch": 0.9090909090909091, + "grad_norm": 0.4956009089946747, + "learning_rate": 9.950467593057406e-05, + "loss": 0.6904002380371094, + "mean_token_accuracy": 0.8083628410100937, + "num_tokens": 894871.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.7681069333965962, + "eval_loss": 0.7430074214935303, + "eval_mean_token_accuracy": 0.7941452006881053, + "eval_num_tokens": 981550.0, + "eval_runtime": 47.9814, + "eval_samples_per_second": 34.534, + "eval_steps_per_second": 4.335, + "step": 385 + }, + { + "entropy": 0.7167584246397019, + "epoch": 1.0389610389610389, + "grad_norm": 0.638637900352478, + "learning_rate": 0.00010976434715123926, + "loss": 0.6611510467529297, + "mean_token_accuracy": 0.8145374125242233, + "num_tokens": 1021041.0, + "step": 400 + }, + { + "entropy": 0.6893188625574111, + "epoch": 1.1688311688311688, + "grad_norm": 0.554688036441803, + "learning_rate": 0.00010967639449071182, + "loss": 0.6347718811035157, + "mean_token_accuracy": 0.8226255452632905, + "num_tokens": 1149161.0, + "step": 450 + }, + { + "entropy": 0.6728485250473022, + "epoch": 1.2987012987012987, + "grad_norm": 0.5104691386222839, + "learning_rate": 0.00010947585797076785, + "loss": 0.6233362579345703, + "mean_token_accuracy": 0.8227335858345032, + "num_tokens": 1278079.0, + "step": 500 + }, + { + "entropy": 0.6812490409612656, + "epoch": 1.4285714285714286, + "grad_norm": 0.4700304865837097, + "learning_rate": 0.00010916314964373551, + "loss": 0.6289351654052734, + "mean_token_accuracy": 0.8217820060253144, + "num_tokens": 1405326.0, + "step": 550 + }, + { + "entropy": 0.6633523851633072, + "epoch": 1.5584415584415585, + "grad_norm": 0.5094137191772461, + "learning_rate": 0.0001087389120469154, + "loss": 0.6168266296386719, + "mean_token_accuracy": 0.8232935756444931, + "num_tokens": 1532507.0, + "step": 600 + }, + { + "entropy": 0.6454597359895706, + "epoch": 1.6883116883116882, + "grad_norm": 0.40546780824661255, + "learning_rate": 0.00010820401688232725, + "loss": 0.5995024108886718, + "mean_token_accuracy": 0.8284876370429992, + "num_tokens": 1662747.0, + "step": 650 + }, + { + "entropy": 0.6700882267951965, + "epoch": 1.8181818181818183, + "grad_norm": 0.35203394293785095, + "learning_rate": 0.00010755956322558065, + "loss": 0.616350212097168, + "mean_token_accuracy": 0.8246171402931214, + "num_tokens": 1783817.0, + "step": 700 + }, + { + "entropy": 0.6587968200445176, + "epoch": 1.948051948051948, + "grad_norm": 0.5248188376426697, + "learning_rate": 0.00010680687526754984, + "loss": 0.608861198425293, + "mean_token_accuracy": 0.8265628081560135, + "num_tokens": 1914573.0, + "step": 750 + }, + { + "epoch": 2.0, + "eval_entropy": 0.7101548368541094, + "eval_loss": 0.6869887113571167, + "eval_mean_token_accuracy": 0.8058141409777678, + "eval_num_tokens": 1963100.0, + "eval_runtime": 48.0161, + "eval_samples_per_second": 34.509, + "eval_steps_per_second": 4.332, + "step": 770 + }, + { + "entropy": 0.6272834652662277, + "epoch": 2.0779220779220777, + "grad_norm": 0.5319918990135193, + "learning_rate": 0.00010594749959349313, + "loss": 0.5719264221191406, + "mean_token_accuracy": 0.8345229256153107, + "num_tokens": 2035968.0, + "step": 800 + }, + { + "entropy": 0.599158108830452, + "epoch": 2.207792207792208, + "grad_norm": 0.4296111762523651, + "learning_rate": 0.00010498320200520744, + "loss": 0.5460208129882812, + "mean_token_accuracy": 0.8396762716770172, + "num_tokens": 2168591.0, + "step": 850 + }, + { + "entropy": 0.5994478952884674, + "epoch": 2.3376623376623376, + "grad_norm": 0.3820176422595978, + "learning_rate": 0.00010391596389274791, + "loss": 0.5483282852172852, + "mean_token_accuracy": 0.8385387778282165, + "num_tokens": 2297967.0, + "step": 900 + }, + { + "entropy": 0.5989043036103249, + "epoch": 2.4675324675324677, + "grad_norm": 0.41718003153800964, + "learning_rate": 0.00010274797816316749, + "loss": 0.543673095703125, + "mean_token_accuracy": 0.8396250855922699, + "num_tokens": 2425614.0, + "step": 950 + }, + { + "entropy": 0.6005555561184883, + "epoch": 2.5974025974025974, + "grad_norm": 0.4552708864212036, + "learning_rate": 0.00010148164473464206, + "loss": 0.5505282974243164, + "mean_token_accuracy": 0.8371219438314438, + "num_tokens": 2557421.0, + "step": 1000 + }, + { + "entropy": 0.5972397547960281, + "epoch": 2.7272727272727275, + "grad_norm": 0.45796939730644226, + "learning_rate": 0.00010011956560523972, + "loss": 0.5410661697387695, + "mean_token_accuracy": 0.8398081564903259, + "num_tokens": 2680379.0, + "step": 1050 + }, + { + "entropy": 0.5875487339496612, + "epoch": 2.857142857142857, + "grad_norm": 0.37569308280944824, + "learning_rate": 9.866453950646624e-05, + "loss": 0.537100830078125, + "mean_token_accuracy": 0.8407874500751495, + "num_tokens": 2810854.0, + "step": 1100 + }, + { + "entropy": 0.6040622130036354, + "epoch": 2.987012987012987, + "grad_norm": 0.4184563457965851, + "learning_rate": 9.711955615257278e-05, + "loss": 0.5466165924072266, + "mean_token_accuracy": 0.8411829793453216, + "num_tokens": 2932637.0, + "step": 1150 + }, + { + "epoch": 3.0, + "eval_entropy": 0.6296488522337034, + "eval_loss": 0.671941876411438, + "eval_mean_token_accuracy": 0.8121863231062889, + "eval_num_tokens": 2944650.0, + "eval_runtime": 47.9946, + "eval_samples_per_second": 34.525, + "eval_steps_per_second": 4.334, + "step": 1155 + }, + { + "entropy": 0.543001911342144, + "epoch": 3.116883116883117, + "grad_norm": 0.39842459559440613, + "learning_rate": 9.548779009744178e-05, + "loss": 0.4850382995605469, + "mean_token_accuracy": 0.8531740349531174, + "num_tokens": 3054046.0, + "step": 1200 + }, + { + "entropy": 0.5417884975671768, + "epoch": 3.2467532467532467, + "grad_norm": 0.529087245464325, + "learning_rate": 9.37725942116738e-05, + "loss": 0.48398651123046876, + "mean_token_accuracy": 0.8532969230413436, + "num_tokens": 3182515.0, + "step": 1250 + }, + { + "entropy": 0.5348580208420753, + "epoch": 3.3766233766233764, + "grad_norm": 0.4483303129673004, + "learning_rate": 9.197749279327802e-05, + "loss": 0.4842509078979492, + "mean_token_accuracy": 0.8533288407325744, + "num_tokens": 3313672.0, + "step": 1300 + }, + { + "entropy": 0.5422027057409287, + "epoch": 3.5064935064935066, + "grad_norm": 0.448234885931015, + "learning_rate": 9.010617432612243e-05, + "loss": 0.48615737915039064, + "mean_token_accuracy": 0.8517620968818664, + "num_tokens": 3440689.0, + "step": 1350 + }, + { + "entropy": 0.5518654137849808, + "epoch": 3.6363636363636362, + "grad_norm": 0.5707411170005798, + "learning_rate": 8.816248390102322e-05, + "loss": 0.4946014404296875, + "mean_token_accuracy": 0.8510974669456481, + "num_tokens": 3566427.0, + "step": 1400 + }, + { + "entropy": 0.5268254142999649, + "epoch": 3.7662337662337664, + "grad_norm": 0.42708849906921387, + "learning_rate": 8.615041531504609e-05, + "loss": 0.474882926940918, + "mean_token_accuracy": 0.8555294382572174, + "num_tokens": 3699976.0, + "step": 1450 + }, + { + "entropy": 0.5430487287044525, + "epoch": 3.896103896103896, + "grad_norm": 0.5479393005371094, + "learning_rate": 8.407410286525337e-05, + "loss": 0.48806171417236327, + "mean_token_accuracy": 0.8523763221502304, + "num_tokens": 3827926.0, + "step": 1500 + }, + { + "epoch": 4.0, + "eval_entropy": 0.6002918778417202, + "eval_loss": 0.6801126003265381, + "eval_mean_token_accuracy": 0.8130095520844827, + "eval_num_tokens": 3926200.0, + "eval_runtime": 47.9844, + "eval_samples_per_second": 34.532, + "eval_steps_per_second": 4.335, + "step": 1540 + }, + { + "entropy": 0.5307527387142181, + "epoch": 4.025974025974026, + "grad_norm": 0.5444459915161133, + "learning_rate": 8.193781285375899e-05, + "loss": 0.4741718292236328, + "mean_token_accuracy": 0.8564361107349395, + "num_tokens": 3953109.0, + "step": 1550 + }, + { + "entropy": 0.47564762085676193, + "epoch": 4.1558441558441555, + "grad_norm": 0.6210038065910339, + "learning_rate": 7.974593482154601e-05, + "loss": 0.41448020935058594, + "mean_token_accuracy": 0.8706729990243912, + "num_tokens": 4081079.0, + "step": 1600 + }, + { + "entropy": 0.4731892004609108, + "epoch": 4.285714285714286, + "grad_norm": 0.4910389482975006, + "learning_rate": 7.750297252905916e-05, + "loss": 0.4135689163208008, + "mean_token_accuracy": 0.8708927237987518, + "num_tokens": 4206426.0, + "step": 1650 + }, + { + "entropy": 0.47941224902868274, + "epoch": 4.415584415584416, + "grad_norm": 0.4900703430175781, + "learning_rate": 7.521353470210501e-05, + "loss": 0.4219230270385742, + "mean_token_accuracy": 0.8685608941316605, + "num_tokens": 4329839.0, + "step": 1700 + }, + { + "entropy": 0.48145264506340024, + "epoch": 4.545454545454545, + "grad_norm": 0.4503515064716339, + "learning_rate": 7.288232556207461e-05, + "loss": 0.4248290252685547, + "mean_token_accuracy": 0.8680109107494354, + "num_tokens": 4453940.0, + "step": 1750 + }, + { + "entropy": 0.47785325974226, + "epoch": 4.675324675324675, + "grad_norm": 0.5630834698677063, + "learning_rate": 7.051413515994661e-05, + "loss": 0.4244534683227539, + "mean_token_accuracy": 0.8692984575033188, + "num_tokens": 4583626.0, + "step": 1800 + }, + { + "entropy": 0.4738149631023407, + "epoch": 4.805194805194805, + "grad_norm": 0.5809922814369202, + "learning_rate": 6.811382953393207e-05, + "loss": 0.41768589019775393, + "mean_token_accuracy": 0.870254020690918, + "num_tokens": 4713540.0, + "step": 1850 + }, + { + "entropy": 0.47820780247449873, + "epoch": 4.935064935064935, + "grad_norm": 0.5658873915672302, + "learning_rate": 6.56863407109845e-05, + "loss": 0.42182437896728514, + "mean_token_accuracy": 0.8680490332841874, + "num_tokens": 4845678.0, + "step": 1900 + }, + { + "epoch": 5.0, + "eval_entropy": 0.5540635969776374, + "eval_loss": 0.6924836039543152, + "eval_mean_token_accuracy": 0.8140181821699326, + "eval_num_tokens": 4907750.0, + "eval_runtime": 47.9926, + "eval_samples_per_second": 34.526, + "eval_steps_per_second": 4.334, + "step": 1925 + }, + { + "entropy": 0.4371735429763794, + "epoch": 5.064935064935065, + "grad_norm": 0.5684086680412292, + "learning_rate": 6.323665657271966e-05, + "loss": 0.3749085998535156, + "mean_token_accuracy": 0.8815305006504058, + "num_tokens": 4975955.0, + "step": 1950 + }, + { + "entropy": 0.40612608641386033, + "epoch": 5.194805194805195, + "grad_norm": 0.6515666842460632, + "learning_rate": 6.076981060656787e-05, + "loss": 0.33952392578125, + "mean_token_accuracy": 0.8902835595607758, + "num_tokens": 5104770.0, + "step": 2000 + }, + { + "entropy": 0.41485957682132724, + "epoch": 5.324675324675325, + "grad_norm": 0.5700598359107971, + "learning_rate": 5.829087156321799e-05, + "loss": 0.345616455078125, + "mean_token_accuracy": 0.8897144883871079, + "num_tokens": 5229052.0, + "step": 2050 + }, + { + "entropy": 0.4269444864988327, + "epoch": 5.454545454545454, + "grad_norm": 0.6670826077461243, + "learning_rate": 5.580493304160404e-05, + "loss": 0.35833843231201173, + "mean_token_accuracy": 0.8866234600543976, + "num_tokens": 5357753.0, + "step": 2100 + }, + { + "entropy": 0.41144982814788816, + "epoch": 5.584415584415584, + "grad_norm": 0.620884120464325, + "learning_rate": 5.331710302283492e-05, + "loss": 0.3445538330078125, + "mean_token_accuracy": 0.8895936322212219, + "num_tokens": 5481550.0, + "step": 2150 + }, + { + "entropy": 0.41389220267534255, + "epoch": 5.714285714285714, + "grad_norm": 0.6487388610839844, + "learning_rate": 5.0832493374572605e-05, + "loss": 0.34858001708984376, + "mean_token_accuracy": 0.8874113804101944, + "num_tokens": 5609104.0, + "step": 2200 + }, + { + "entropy": 0.41483602195978164, + "epoch": 5.8441558441558445, + "grad_norm": 0.5946773290634155, + "learning_rate": 4.835620934742408e-05, + "loss": 0.3495229721069336, + "mean_token_accuracy": 0.8887655180692673, + "num_tokens": 5737347.0, + "step": 2250 + }, + { + "entropy": 0.4204313641786575, + "epoch": 5.974025974025974, + "grad_norm": 0.6658430099487305, + "learning_rate": 4.589333908492996e-05, + "loss": 0.3538378143310547, + "mean_token_accuracy": 0.8866806083917618, + "num_tokens": 5861872.0, + "step": 2300 + }, + { + "epoch": 6.0, + "eval_entropy": 0.4912281467650945, + "eval_loss": 0.7537463903427124, + "eval_mean_token_accuracy": 0.8104288898981534, + "eval_num_tokens": 5889300.0, + "eval_runtime": 48.0083, + "eval_samples_per_second": 34.515, + "eval_steps_per_second": 4.333, + "step": 2310 + }, + { + "entropy": 0.3666571286320686, + "epoch": 6.103896103896104, + "grad_norm": 0.6230902671813965, + "learning_rate": 4.344894316870371e-05, + "loss": 0.2813127517700195, + "mean_token_accuracy": 0.9077165073156357, + "num_tokens": 5990679.0, + "step": 2350 + }, + { + "entropy": 0.3391442158818245, + "epoch": 6.233766233766234, + "grad_norm": 0.629700779914856, + "learning_rate": 4.1028044220203685e-05, + "loss": 0.26457656860351564, + "mean_token_accuracy": 0.9139899307489395, + "num_tokens": 6117918.0, + "step": 2400 + }, + { + "entropy": 0.34152675241231917, + "epoch": 6.363636363636363, + "grad_norm": 0.711188793182373, + "learning_rate": 3.863561658050396e-05, + "loss": 0.26950265884399416, + "mean_token_accuracy": 0.9120226174592971, + "num_tokens": 6248566.0, + "step": 2450 + }, + { + "entropy": 0.33951902255415917, + "epoch": 6.4935064935064934, + "grad_norm": 0.7751043438911438, + "learning_rate": 3.627657608926905e-05, + "loss": 0.26502132415771484, + "mean_token_accuracy": 0.9131791013479232, + "num_tokens": 6380529.0, + "step": 2500 + }, + { + "entropy": 0.3548544436693192, + "epoch": 6.623376623376624, + "grad_norm": 0.9152925610542297, + "learning_rate": 3.395576998393457e-05, + "loss": 0.27833885192871094, + "mean_token_accuracy": 0.9090453034639359, + "num_tokens": 6501669.0, + "step": 2550 + }, + { + "entropy": 0.3550854653120041, + "epoch": 6.753246753246753, + "grad_norm": 0.8495270013809204, + "learning_rate": 3.167796693984804e-05, + "loss": 0.27818309783935546, + "mean_token_accuracy": 0.9102006632089615, + "num_tokens": 6624404.0, + "step": 2600 + }, + { + "entropy": 0.34031844735145567, + "epoch": 6.883116883116883, + "grad_norm": 0.6764019727706909, + "learning_rate": 2.9447847271835456e-05, + "loss": 0.26656494140625, + "mean_token_accuracy": 0.9126953399181366, + "num_tokens": 6755532.0, + "step": 2650 + }, + { + "epoch": 7.0, + "eval_entropy": 0.4410387526911039, + "eval_loss": 0.8089802265167236, + "eval_mean_token_accuracy": 0.80925900173875, + "eval_num_tokens": 6870850.0, + "eval_runtime": 47.9562, + "eval_samples_per_second": 34.552, + "eval_steps_per_second": 4.337, + "step": 2695 + }, + { + "entropy": 0.335344740152359, + "epoch": 7.012987012987013, + "grad_norm": 0.6364794373512268, + "learning_rate": 2.7269993317326242e-05, + "loss": 0.25932022094726564, + "mean_token_accuracy": 0.9156477284431458, + "num_tokens": 6884338.0, + "step": 2700 + }, + { + "entropy": 0.283698299229145, + "epoch": 7.142857142857143, + "grad_norm": 0.9388208389282227, + "learning_rate": 2.514888002079755e-05, + "loss": 0.19749004364013673, + "mean_token_accuracy": 0.9353394263982773, + "num_tokens": 7008981.0, + "step": 2750 + }, + { + "entropy": 0.27516734033823015, + "epoch": 7.2727272727272725, + "grad_norm": 0.706503689289093, + "learning_rate": 2.3088865738883814e-05, + "loss": 0.19110334396362305, + "mean_token_accuracy": 0.9379108762741089, + "num_tokens": 7135402.0, + "step": 2800 + }, + { + "entropy": 0.273246209025383, + "epoch": 7.402597402597403, + "grad_norm": 0.7857301235198975, + "learning_rate": 2.1094183285045552e-05, + "loss": 0.19094297409057617, + "mean_token_accuracy": 0.9369927847385406, + "num_tokens": 7265920.0, + "step": 2850 + }, + { + "entropy": 0.2801500430703163, + "epoch": 7.532467532467533, + "grad_norm": 0.7793248891830444, + "learning_rate": 1.9168931232197576e-05, + "loss": 0.19656993865966796, + "mean_token_accuracy": 0.9351688891649246, + "num_tokens": 7389633.0, + "step": 2900 + }, + { + "entropy": 0.27495736733078957, + "epoch": 7.662337662337662, + "grad_norm": 0.8637392520904541, + "learning_rate": 1.7317065491168085e-05, + "loss": 0.1936025810241699, + "mean_token_accuracy": 0.9363743001222611, + "num_tokens": 7518696.0, + "step": 2950 + }, + { + "entropy": 0.2810053497552872, + "epoch": 7.792207792207792, + "grad_norm": 0.7629940509796143, + "learning_rate": 1.554239118229261e-05, + "loss": 0.1976767921447754, + "mean_token_accuracy": 0.9348012053966522, + "num_tokens": 7643525.0, + "step": 3000 + }, + { + "entropy": 0.272479218095541, + "epoch": 7.922077922077922, + "grad_norm": 0.8325297832489014, + "learning_rate": 1.3848554816844692e-05, + "loss": 0.1889443016052246, + "mean_token_accuracy": 0.9384464406967163, + "num_tokens": 7773494.0, + "step": 3050 + }, + { + "epoch": 8.0, + "eval_entropy": 0.4007443663879083, + "eval_loss": 0.8944177031517029, + "eval_mean_token_accuracy": 0.8053252046497968, + "eval_num_tokens": 7852400.0, + "eval_runtime": 47.9727, + "eval_samples_per_second": 34.54, + "eval_steps_per_second": 4.336, + "step": 3080 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.3542904203667354e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..34b529d53848971eda3155b95c185289489ac0b8 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.04641649878824187, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj", + "gate_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3513d449f09295b638b1af75e4254ea4b8f2ace5 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/trainer_state.json @@ -0,0 +1,823 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.0, + "eval_steps": 500, + "global_step": 3465, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.0450534927845, + "epoch": 0.12987012987012986, + "grad_norm": 0.9224470853805542, + "learning_rate": 1.3970570546126444e-05, + "loss": 1.944740753173828, + "mean_token_accuracy": 0.6158743992447853, + "num_tokens": 133878.0, + "step": 50 + }, + { + "entropy": 1.1079417771101, + "epoch": 0.2597402597402597, + "grad_norm": 0.6827074289321899, + "learning_rate": 2.822625477686771e-05, + "loss": 1.0624147033691407, + "mean_token_accuracy": 0.7396554726362229, + "num_tokens": 265533.0, + "step": 100 + }, + { + "entropy": 0.8689985585212707, + "epoch": 0.38961038961038963, + "grad_norm": 0.6320164203643799, + "learning_rate": 4.248193900760899e-05, + "loss": 0.8256858825683594, + "mean_token_accuracy": 0.7825630265474319, + "num_tokens": 386480.0, + "step": 150 + }, + { + "entropy": 0.7946180325746536, + "epoch": 0.5194805194805194, + "grad_norm": 0.7246975302696228, + "learning_rate": 5.6737623238350247e-05, + "loss": 0.7475, + "mean_token_accuracy": 0.7959125518798829, + "num_tokens": 520135.0, + "step": 200 + }, + { + "entropy": 0.7919283157587051, + "epoch": 0.6493506493506493, + "grad_norm": 0.7635198831558228, + "learning_rate": 7.099330746909153e-05, + "loss": 0.737699966430664, + "mean_token_accuracy": 0.798919832110405, + "num_tokens": 640303.0, + "step": 250 + }, + { + "entropy": 0.7713775283098221, + "epoch": 0.7792207792207793, + "grad_norm": 0.5751814842224121, + "learning_rate": 8.52489916998328e-05, + "loss": 0.7186790466308594, + "mean_token_accuracy": 0.8014196193218232, + "num_tokens": 766027.0, + "step": 300 + }, + { + "entropy": 0.737273331284523, + "epoch": 0.9090909090909091, + "grad_norm": 0.4956009089946747, + "learning_rate": 9.950467593057406e-05, + "loss": 0.6904002380371094, + "mean_token_accuracy": 0.8083628410100937, + "num_tokens": 894871.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.7681069333965962, + "eval_loss": 0.7430074214935303, + "eval_mean_token_accuracy": 0.7941452006881053, + "eval_num_tokens": 981550.0, + "eval_runtime": 47.9814, + "eval_samples_per_second": 34.534, + "eval_steps_per_second": 4.335, + "step": 385 + }, + { + "entropy": 0.7167584246397019, + "epoch": 1.0389610389610389, + "grad_norm": 0.638637900352478, + "learning_rate": 0.00010976434715123926, + "loss": 0.6611510467529297, + "mean_token_accuracy": 0.8145374125242233, + "num_tokens": 1021041.0, + "step": 400 + }, + { + "entropy": 0.6893188625574111, + "epoch": 1.1688311688311688, + "grad_norm": 0.554688036441803, + "learning_rate": 0.00010967639449071182, + "loss": 0.6347718811035157, + "mean_token_accuracy": 0.8226255452632905, + "num_tokens": 1149161.0, + "step": 450 + }, + { + "entropy": 0.6728485250473022, + "epoch": 1.2987012987012987, + "grad_norm": 0.5104691386222839, + "learning_rate": 0.00010947585797076785, + "loss": 0.6233362579345703, + "mean_token_accuracy": 0.8227335858345032, + "num_tokens": 1278079.0, + "step": 500 + }, + { + "entropy": 0.6812490409612656, + "epoch": 1.4285714285714286, + "grad_norm": 0.4700304865837097, + "learning_rate": 0.00010916314964373551, + "loss": 0.6289351654052734, + "mean_token_accuracy": 0.8217820060253144, + "num_tokens": 1405326.0, + "step": 550 + }, + { + "entropy": 0.6633523851633072, + "epoch": 1.5584415584415585, + "grad_norm": 0.5094137191772461, + "learning_rate": 0.0001087389120469154, + "loss": 0.6168266296386719, + "mean_token_accuracy": 0.8232935756444931, + "num_tokens": 1532507.0, + "step": 600 + }, + { + "entropy": 0.6454597359895706, + "epoch": 1.6883116883116882, + "grad_norm": 0.40546780824661255, + "learning_rate": 0.00010820401688232725, + "loss": 0.5995024108886718, + "mean_token_accuracy": 0.8284876370429992, + "num_tokens": 1662747.0, + "step": 650 + }, + { + "entropy": 0.6700882267951965, + "epoch": 1.8181818181818183, + "grad_norm": 0.35203394293785095, + "learning_rate": 0.00010755956322558065, + "loss": 0.616350212097168, + "mean_token_accuracy": 0.8246171402931214, + "num_tokens": 1783817.0, + "step": 700 + }, + { + "entropy": 0.6587968200445176, + "epoch": 1.948051948051948, + "grad_norm": 0.5248188376426697, + "learning_rate": 0.00010680687526754984, + "loss": 0.608861198425293, + "mean_token_accuracy": 0.8265628081560135, + "num_tokens": 1914573.0, + "step": 750 + }, + { + "epoch": 2.0, + "eval_entropy": 0.7101548368541094, + "eval_loss": 0.6869887113571167, + "eval_mean_token_accuracy": 0.8058141409777678, + "eval_num_tokens": 1963100.0, + "eval_runtime": 48.0161, + "eval_samples_per_second": 34.509, + "eval_steps_per_second": 4.332, + "step": 770 + }, + { + "entropy": 0.6272834652662277, + "epoch": 2.0779220779220777, + "grad_norm": 0.5319918990135193, + "learning_rate": 0.00010594749959349313, + "loss": 0.5719264221191406, + "mean_token_accuracy": 0.8345229256153107, + "num_tokens": 2035968.0, + "step": 800 + }, + { + "entropy": 0.599158108830452, + "epoch": 2.207792207792208, + "grad_norm": 0.4296111762523651, + "learning_rate": 0.00010498320200520744, + "loss": 0.5460208129882812, + "mean_token_accuracy": 0.8396762716770172, + "num_tokens": 2168591.0, + "step": 850 + }, + { + "entropy": 0.5994478952884674, + "epoch": 2.3376623376623376, + "grad_norm": 0.3820176422595978, + "learning_rate": 0.00010391596389274791, + "loss": 0.5483282852172852, + "mean_token_accuracy": 0.8385387778282165, + "num_tokens": 2297967.0, + "step": 900 + }, + { + "entropy": 0.5989043036103249, + "epoch": 2.4675324675324677, + "grad_norm": 0.41718003153800964, + "learning_rate": 0.00010274797816316749, + "loss": 0.543673095703125, + "mean_token_accuracy": 0.8396250855922699, + "num_tokens": 2425614.0, + "step": 950 + }, + { + "entropy": 0.6005555561184883, + "epoch": 2.5974025974025974, + "grad_norm": 0.4552708864212036, + "learning_rate": 0.00010148164473464206, + "loss": 0.5505282974243164, + "mean_token_accuracy": 0.8371219438314438, + "num_tokens": 2557421.0, + "step": 1000 + }, + { + "entropy": 0.5972397547960281, + "epoch": 2.7272727272727275, + "grad_norm": 0.45796939730644226, + "learning_rate": 0.00010011956560523972, + "loss": 0.5410661697387695, + "mean_token_accuracy": 0.8398081564903259, + "num_tokens": 2680379.0, + "step": 1050 + }, + { + "entropy": 0.5875487339496612, + "epoch": 2.857142857142857, + "grad_norm": 0.37569308280944824, + "learning_rate": 9.866453950646624e-05, + "loss": 0.537100830078125, + "mean_token_accuracy": 0.8407874500751495, + "num_tokens": 2810854.0, + "step": 1100 + }, + { + "entropy": 0.6040622130036354, + "epoch": 2.987012987012987, + "grad_norm": 0.4184563457965851, + "learning_rate": 9.711955615257278e-05, + "loss": 0.5466165924072266, + "mean_token_accuracy": 0.8411829793453216, + "num_tokens": 2932637.0, + "step": 1150 + }, + { + "epoch": 3.0, + "eval_entropy": 0.6296488522337034, + "eval_loss": 0.671941876411438, + "eval_mean_token_accuracy": 0.8121863231062889, + "eval_num_tokens": 2944650.0, + "eval_runtime": 47.9946, + "eval_samples_per_second": 34.525, + "eval_steps_per_second": 4.334, + "step": 1155 + }, + { + "entropy": 0.543001911342144, + "epoch": 3.116883116883117, + "grad_norm": 0.39842459559440613, + "learning_rate": 9.548779009744178e-05, + "loss": 0.4850382995605469, + "mean_token_accuracy": 0.8531740349531174, + "num_tokens": 3054046.0, + "step": 1200 + }, + { + "entropy": 0.5417884975671768, + "epoch": 3.2467532467532467, + "grad_norm": 0.529087245464325, + "learning_rate": 9.37725942116738e-05, + "loss": 0.48398651123046876, + "mean_token_accuracy": 0.8532969230413436, + "num_tokens": 3182515.0, + "step": 1250 + }, + { + "entropy": 0.5348580208420753, + "epoch": 3.3766233766233764, + "grad_norm": 0.4483303129673004, + "learning_rate": 9.197749279327802e-05, + "loss": 0.4842509078979492, + "mean_token_accuracy": 0.8533288407325744, + "num_tokens": 3313672.0, + "step": 1300 + }, + { + "entropy": 0.5422027057409287, + "epoch": 3.5064935064935066, + "grad_norm": 0.448234885931015, + "learning_rate": 9.010617432612243e-05, + "loss": 0.48615737915039064, + "mean_token_accuracy": 0.8517620968818664, + "num_tokens": 3440689.0, + "step": 1350 + }, + { + "entropy": 0.5518654137849808, + "epoch": 3.6363636363636362, + "grad_norm": 0.5707411170005798, + "learning_rate": 8.816248390102322e-05, + "loss": 0.4946014404296875, + "mean_token_accuracy": 0.8510974669456481, + "num_tokens": 3566427.0, + "step": 1400 + }, + { + "entropy": 0.5268254142999649, + "epoch": 3.7662337662337664, + "grad_norm": 0.42708849906921387, + "learning_rate": 8.615041531504609e-05, + "loss": 0.474882926940918, + "mean_token_accuracy": 0.8555294382572174, + "num_tokens": 3699976.0, + "step": 1450 + }, + { + "entropy": 0.5430487287044525, + "epoch": 3.896103896103896, + "grad_norm": 0.5479393005371094, + "learning_rate": 8.407410286525337e-05, + "loss": 0.48806171417236327, + "mean_token_accuracy": 0.8523763221502304, + "num_tokens": 3827926.0, + "step": 1500 + }, + { + "epoch": 4.0, + "eval_entropy": 0.6002918778417202, + "eval_loss": 0.6801126003265381, + "eval_mean_token_accuracy": 0.8130095520844827, + "eval_num_tokens": 3926200.0, + "eval_runtime": 47.9844, + "eval_samples_per_second": 34.532, + "eval_steps_per_second": 4.335, + "step": 1540 + }, + { + "entropy": 0.5307527387142181, + "epoch": 4.025974025974026, + "grad_norm": 0.5444459915161133, + "learning_rate": 8.193781285375899e-05, + "loss": 0.4741718292236328, + "mean_token_accuracy": 0.8564361107349395, + "num_tokens": 3953109.0, + "step": 1550 + }, + { + "entropy": 0.47564762085676193, + "epoch": 4.1558441558441555, + "grad_norm": 0.6210038065910339, + "learning_rate": 7.974593482154601e-05, + "loss": 0.41448020935058594, + "mean_token_accuracy": 0.8706729990243912, + "num_tokens": 4081079.0, + "step": 1600 + }, + { + "entropy": 0.4731892004609108, + "epoch": 4.285714285714286, + "grad_norm": 0.4910389482975006, + "learning_rate": 7.750297252905916e-05, + "loss": 0.4135689163208008, + "mean_token_accuracy": 0.8708927237987518, + "num_tokens": 4206426.0, + "step": 1650 + }, + { + "entropy": 0.47941224902868274, + "epoch": 4.415584415584416, + "grad_norm": 0.4900703430175781, + "learning_rate": 7.521353470210501e-05, + "loss": 0.4219230270385742, + "mean_token_accuracy": 0.8685608941316605, + "num_tokens": 4329839.0, + "step": 1700 + }, + { + "entropy": 0.48145264506340024, + "epoch": 4.545454545454545, + "grad_norm": 0.4503515064716339, + "learning_rate": 7.288232556207461e-05, + "loss": 0.4248290252685547, + "mean_token_accuracy": 0.8680109107494354, + "num_tokens": 4453940.0, + "step": 1750 + }, + { + "entropy": 0.47785325974226, + "epoch": 4.675324675324675, + "grad_norm": 0.5630834698677063, + "learning_rate": 7.051413515994661e-05, + "loss": 0.4244534683227539, + "mean_token_accuracy": 0.8692984575033188, + "num_tokens": 4583626.0, + "step": 1800 + }, + { + "entropy": 0.4738149631023407, + "epoch": 4.805194805194805, + "grad_norm": 0.5809922814369202, + "learning_rate": 6.811382953393207e-05, + "loss": 0.41768589019775393, + "mean_token_accuracy": 0.870254020690918, + "num_tokens": 4713540.0, + "step": 1850 + }, + { + "entropy": 0.47820780247449873, + "epoch": 4.935064935064935, + "grad_norm": 0.5658873915672302, + "learning_rate": 6.56863407109845e-05, + "loss": 0.42182437896728514, + "mean_token_accuracy": 0.8680490332841874, + "num_tokens": 4845678.0, + "step": 1900 + }, + { + "epoch": 5.0, + "eval_entropy": 0.5540635969776374, + "eval_loss": 0.6924836039543152, + "eval_mean_token_accuracy": 0.8140181821699326, + "eval_num_tokens": 4907750.0, + "eval_runtime": 47.9926, + "eval_samples_per_second": 34.526, + "eval_steps_per_second": 4.334, + "step": 1925 + }, + { + "entropy": 0.4371735429763794, + "epoch": 5.064935064935065, + "grad_norm": 0.5684086680412292, + "learning_rate": 6.323665657271966e-05, + "loss": 0.3749085998535156, + "mean_token_accuracy": 0.8815305006504058, + "num_tokens": 4975955.0, + "step": 1950 + }, + { + "entropy": 0.40612608641386033, + "epoch": 5.194805194805195, + "grad_norm": 0.6515666842460632, + "learning_rate": 6.076981060656787e-05, + "loss": 0.33952392578125, + "mean_token_accuracy": 0.8902835595607758, + "num_tokens": 5104770.0, + "step": 2000 + }, + { + "entropy": 0.41485957682132724, + "epoch": 5.324675324675325, + "grad_norm": 0.5700598359107971, + "learning_rate": 5.829087156321799e-05, + "loss": 0.345616455078125, + "mean_token_accuracy": 0.8897144883871079, + "num_tokens": 5229052.0, + "step": 2050 + }, + { + "entropy": 0.4269444864988327, + "epoch": 5.454545454545454, + "grad_norm": 0.6670826077461243, + "learning_rate": 5.580493304160404e-05, + "loss": 0.35833843231201173, + "mean_token_accuracy": 0.8866234600543976, + "num_tokens": 5357753.0, + "step": 2100 + }, + { + "entropy": 0.41144982814788816, + "epoch": 5.584415584415584, + "grad_norm": 0.620884120464325, + "learning_rate": 5.331710302283492e-05, + "loss": 0.3445538330078125, + "mean_token_accuracy": 0.8895936322212219, + "num_tokens": 5481550.0, + "step": 2150 + }, + { + "entropy": 0.41389220267534255, + "epoch": 5.714285714285714, + "grad_norm": 0.6487388610839844, + "learning_rate": 5.0832493374572605e-05, + "loss": 0.34858001708984376, + "mean_token_accuracy": 0.8874113804101944, + "num_tokens": 5609104.0, + "step": 2200 + }, + { + "entropy": 0.41483602195978164, + "epoch": 5.8441558441558445, + "grad_norm": 0.5946773290634155, + "learning_rate": 4.835620934742408e-05, + "loss": 0.3495229721069336, + "mean_token_accuracy": 0.8887655180692673, + "num_tokens": 5737347.0, + "step": 2250 + }, + { + "entropy": 0.4204313641786575, + "epoch": 5.974025974025974, + "grad_norm": 0.6658430099487305, + "learning_rate": 4.589333908492996e-05, + "loss": 0.3538378143310547, + "mean_token_accuracy": 0.8866806083917618, + "num_tokens": 5861872.0, + "step": 2300 + }, + { + "epoch": 6.0, + "eval_entropy": 0.4912281467650945, + "eval_loss": 0.7537463903427124, + "eval_mean_token_accuracy": 0.8104288898981534, + "eval_num_tokens": 5889300.0, + "eval_runtime": 48.0083, + "eval_samples_per_second": 34.515, + "eval_steps_per_second": 4.333, + "step": 2310 + }, + { + "entropy": 0.3666571286320686, + "epoch": 6.103896103896104, + "grad_norm": 0.6230902671813965, + "learning_rate": 4.344894316870371e-05, + "loss": 0.2813127517700195, + "mean_token_accuracy": 0.9077165073156357, + "num_tokens": 5990679.0, + "step": 2350 + }, + { + "entropy": 0.3391442158818245, + "epoch": 6.233766233766234, + "grad_norm": 0.629700779914856, + "learning_rate": 4.1028044220203685e-05, + "loss": 0.26457656860351564, + "mean_token_accuracy": 0.9139899307489395, + "num_tokens": 6117918.0, + "step": 2400 + }, + { + "entropy": 0.34152675241231917, + "epoch": 6.363636363636363, + "grad_norm": 0.711188793182373, + "learning_rate": 3.863561658050396e-05, + "loss": 0.26950265884399416, + "mean_token_accuracy": 0.9120226174592971, + "num_tokens": 6248566.0, + "step": 2450 + }, + { + "entropy": 0.33951902255415917, + "epoch": 6.4935064935064934, + "grad_norm": 0.7751043438911438, + "learning_rate": 3.627657608926905e-05, + "loss": 0.26502132415771484, + "mean_token_accuracy": 0.9131791013479232, + "num_tokens": 6380529.0, + "step": 2500 + }, + { + "entropy": 0.3548544436693192, + "epoch": 6.623376623376624, + "grad_norm": 0.9152925610542297, + "learning_rate": 3.395576998393457e-05, + "loss": 0.27833885192871094, + "mean_token_accuracy": 0.9090453034639359, + "num_tokens": 6501669.0, + "step": 2550 + }, + { + "entropy": 0.3550854653120041, + "epoch": 6.753246753246753, + "grad_norm": 0.8495270013809204, + "learning_rate": 3.167796693984804e-05, + "loss": 0.27818309783935546, + "mean_token_accuracy": 0.9102006632089615, + "num_tokens": 6624404.0, + "step": 2600 + }, + { + "entropy": 0.34031844735145567, + "epoch": 6.883116883116883, + "grad_norm": 0.6764019727706909, + "learning_rate": 2.9447847271835456e-05, + "loss": 0.26656494140625, + "mean_token_accuracy": 0.9126953399181366, + "num_tokens": 6755532.0, + "step": 2650 + }, + { + "epoch": 7.0, + "eval_entropy": 0.4410387526911039, + "eval_loss": 0.8089802265167236, + "eval_mean_token_accuracy": 0.80925900173875, + "eval_num_tokens": 6870850.0, + "eval_runtime": 47.9562, + "eval_samples_per_second": 34.552, + "eval_steps_per_second": 4.337, + "step": 2695 + }, + { + "entropy": 0.335344740152359, + "epoch": 7.012987012987013, + "grad_norm": 0.6364794373512268, + "learning_rate": 2.7269993317326242e-05, + "loss": 0.25932022094726564, + "mean_token_accuracy": 0.9156477284431458, + "num_tokens": 6884338.0, + "step": 2700 + }, + { + "entropy": 0.283698299229145, + "epoch": 7.142857142857143, + "grad_norm": 0.9388208389282227, + "learning_rate": 2.514888002079755e-05, + "loss": 0.19749004364013673, + "mean_token_accuracy": 0.9353394263982773, + "num_tokens": 7008981.0, + "step": 2750 + }, + { + "entropy": 0.27516734033823015, + "epoch": 7.2727272727272725, + "grad_norm": 0.706503689289093, + "learning_rate": 2.3088865738883814e-05, + "loss": 0.19110334396362305, + "mean_token_accuracy": 0.9379108762741089, + "num_tokens": 7135402.0, + "step": 2800 + }, + { + "entropy": 0.273246209025383, + "epoch": 7.402597402597403, + "grad_norm": 0.7857301235198975, + "learning_rate": 2.1094183285045552e-05, + "loss": 0.19094297409057617, + "mean_token_accuracy": 0.9369927847385406, + "num_tokens": 7265920.0, + "step": 2850 + }, + { + "entropy": 0.2801500430703163, + "epoch": 7.532467532467533, + "grad_norm": 0.7793248891830444, + "learning_rate": 1.9168931232197576e-05, + "loss": 0.19656993865966796, + "mean_token_accuracy": 0.9351688891649246, + "num_tokens": 7389633.0, + "step": 2900 + }, + { + "entropy": 0.27495736733078957, + "epoch": 7.662337662337662, + "grad_norm": 0.8637392520904541, + "learning_rate": 1.7317065491168085e-05, + "loss": 0.1936025810241699, + "mean_token_accuracy": 0.9363743001222611, + "num_tokens": 7518696.0, + "step": 2950 + }, + { + "entropy": 0.2810053497552872, + "epoch": 7.792207792207792, + "grad_norm": 0.7629940509796143, + "learning_rate": 1.554239118229261e-05, + "loss": 0.1976767921447754, + "mean_token_accuracy": 0.9348012053966522, + "num_tokens": 7643525.0, + "step": 3000 + }, + { + "entropy": 0.272479218095541, + "epoch": 7.922077922077922, + "grad_norm": 0.8325297832489014, + "learning_rate": 1.3848554816844692e-05, + "loss": 0.1889443016052246, + "mean_token_accuracy": 0.9384464406967163, + "num_tokens": 7773494.0, + "step": 3050 + }, + { + "epoch": 8.0, + "eval_entropy": 0.4007443663879083, + "eval_loss": 0.8944177031517029, + "eval_mean_token_accuracy": 0.8053252046497968, + "eval_num_tokens": 7852400.0, + "eval_runtime": 47.9727, + "eval_samples_per_second": 34.54, + "eval_steps_per_second": 4.336, + "step": 3080 + }, + { + "entropy": 0.2529501300305128, + "epoch": 8.051948051948052, + "grad_norm": 0.7490427494049072, + "learning_rate": 1.2239036804368287e-05, + "loss": 0.1695658302307129, + "mean_token_accuracy": 0.9450542360544205, + "num_tokens": 7903348.0, + "step": 3100 + }, + { + "entropy": 0.2297460925579071, + "epoch": 8.181818181818182, + "grad_norm": 0.7783088088035583, + "learning_rate": 1.0717144301307847e-05, + "loss": 0.13803850173950194, + "mean_token_accuracy": 0.9550822985172271, + "num_tokens": 8028778.0, + "step": 3150 + }, + { + "entropy": 0.230171779692173, + "epoch": 8.311688311688311, + "grad_norm": 0.8110005259513855, + "learning_rate": 9.286004415629994e-06, + "loss": 0.1422537612915039, + "mean_token_accuracy": 0.9539634013175964, + "num_tokens": 8153699.0, + "step": 3200 + }, + { + "entropy": 0.22410715252161026, + "epoch": 8.441558441558442, + "grad_norm": 0.6850584745407104, + "learning_rate": 7.948557781399818e-06, + "loss": 0.13708532333374024, + "mean_token_accuracy": 0.9560226953029632, + "num_tokens": 8281408.0, + "step": 3250 + }, + { + "entropy": 0.23170628443360328, + "epoch": 8.571428571428571, + "grad_norm": 0.7979664206504822, + "learning_rate": 6.707552516514227e-06, + "loss": 0.14379706382751464, + "mean_token_accuracy": 0.9544130796194077, + "num_tokens": 8400657.0, + "step": 3300 + }, + { + "entropy": 0.2197262801229954, + "epoch": 8.7012987012987, + "grad_norm": 0.6074482202529907, + "learning_rate": 5.565538576007922e-06, + "loss": 0.13595272064208985, + "mean_token_accuracy": 0.9567223310470581, + "num_tokens": 8533080.0, + "step": 3350 + }, + { + "entropy": 0.2189832380414009, + "epoch": 8.831168831168831, + "grad_norm": 0.7105876207351685, + "learning_rate": 4.5248625125343745e-06, + "loss": 0.13385194778442383, + "mean_token_accuracy": 0.9567512100934983, + "num_tokens": 8664922.0, + "step": 3400 + }, + { + "entropy": 0.2234116178750992, + "epoch": 8.96103896103896, + "grad_norm": 0.7933406829833984, + "learning_rate": 3.587662654787801e-06, + "loss": 0.13944730758666993, + "mean_token_accuracy": 0.9551076376438141, + "num_tokens": 8797714.0, + "step": 3450 + }, + { + "epoch": 9.0, + "eval_entropy": 0.36579762743069577, + "eval_loss": 0.9979091286659241, + "eval_mean_token_accuracy": 0.8024594216392591, + "eval_num_tokens": 8833950.0, + "eval_runtime": 47.9914, + "eval_samples_per_second": 34.527, + "eval_steps_per_second": 4.334, + "step": 3465 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.8981730000368845e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..34b529d53848971eda3155b95c185289489ac0b8 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.04641649878824187, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj", + "gate_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..029a90a5d5388bca1d16b8579c3ca64441da31fb --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/trainer_state.json @@ -0,0 +1,115 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 385, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.0450534927845, + "epoch": 0.12987012987012986, + "grad_norm": 0.9224470853805542, + "learning_rate": 1.3970570546126444e-05, + "loss": 1.944740753173828, + "mean_token_accuracy": 0.6158743992447853, + "num_tokens": 133878.0, + "step": 50 + }, + { + "entropy": 1.1079417771101, + "epoch": 0.2597402597402597, + "grad_norm": 0.6827074289321899, + "learning_rate": 2.822625477686771e-05, + "loss": 1.0624147033691407, + "mean_token_accuracy": 0.7396554726362229, + "num_tokens": 265533.0, + "step": 100 + }, + { + "entropy": 0.8689985585212707, + "epoch": 0.38961038961038963, + "grad_norm": 0.6320164203643799, + "learning_rate": 4.248193900760899e-05, + "loss": 0.8256858825683594, + "mean_token_accuracy": 0.7825630265474319, + "num_tokens": 386480.0, + "step": 150 + }, + { + "entropy": 0.7946180325746536, + "epoch": 0.5194805194805194, + "grad_norm": 0.7246975302696228, + "learning_rate": 5.6737623238350247e-05, + "loss": 0.7475, + "mean_token_accuracy": 0.7959125518798829, + "num_tokens": 520135.0, + "step": 200 + }, + { + "entropy": 0.7919283157587051, + "epoch": 0.6493506493506493, + "grad_norm": 0.7635198831558228, + "learning_rate": 7.099330746909153e-05, + "loss": 0.737699966430664, + "mean_token_accuracy": 0.798919832110405, + "num_tokens": 640303.0, + "step": 250 + }, + { + "entropy": 0.7713775283098221, + "epoch": 0.7792207792207793, + "grad_norm": 0.5751814842224121, + "learning_rate": 8.52489916998328e-05, + "loss": 0.7186790466308594, + "mean_token_accuracy": 0.8014196193218232, + "num_tokens": 766027.0, + "step": 300 + }, + { + "entropy": 0.737273331284523, + "epoch": 0.9090909090909091, + "grad_norm": 0.4956009089946747, + "learning_rate": 9.950467593057406e-05, + "loss": 0.6904002380371094, + "mean_token_accuracy": 0.8083628410100937, + "num_tokens": 894871.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.7681069333965962, + "eval_loss": 0.7430074214935303, + "eval_mean_token_accuracy": 0.7941452006881053, + "eval_num_tokens": 981550.0, + "eval_runtime": 47.9814, + "eval_samples_per_second": 34.534, + "eval_steps_per_second": 4.335, + "step": 385 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.429451631126118e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..34b529d53848971eda3155b95c185289489ac0b8 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.04641649878824187, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj", + "gate_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..263ec3a0d7823b7aadd413fb360c3e1765bbabc3 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/trainer_state.json @@ -0,0 +1,914 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "eval_steps": 500, + "global_step": 3850, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.0450534927845, + "epoch": 0.12987012987012986, + "grad_norm": 0.9224470853805542, + "learning_rate": 1.3970570546126444e-05, + "loss": 1.944740753173828, + "mean_token_accuracy": 0.6158743992447853, + "num_tokens": 133878.0, + "step": 50 + }, + { + "entropy": 1.1079417771101, + "epoch": 0.2597402597402597, + "grad_norm": 0.6827074289321899, + "learning_rate": 2.822625477686771e-05, + "loss": 1.0624147033691407, + "mean_token_accuracy": 0.7396554726362229, + "num_tokens": 265533.0, + "step": 100 + }, + { + "entropy": 0.8689985585212707, + "epoch": 0.38961038961038963, + "grad_norm": 0.6320164203643799, + "learning_rate": 4.248193900760899e-05, + "loss": 0.8256858825683594, + "mean_token_accuracy": 0.7825630265474319, + "num_tokens": 386480.0, + "step": 150 + }, + { + "entropy": 0.7946180325746536, + "epoch": 0.5194805194805194, + "grad_norm": 0.7246975302696228, + "learning_rate": 5.6737623238350247e-05, + "loss": 0.7475, + "mean_token_accuracy": 0.7959125518798829, + "num_tokens": 520135.0, + "step": 200 + }, + { + "entropy": 0.7919283157587051, + "epoch": 0.6493506493506493, + "grad_norm": 0.7635198831558228, + "learning_rate": 7.099330746909153e-05, + "loss": 0.737699966430664, + "mean_token_accuracy": 0.798919832110405, + "num_tokens": 640303.0, + "step": 250 + }, + { + "entropy": 0.7713775283098221, + "epoch": 0.7792207792207793, + "grad_norm": 0.5751814842224121, + "learning_rate": 8.52489916998328e-05, + "loss": 0.7186790466308594, + "mean_token_accuracy": 0.8014196193218232, + "num_tokens": 766027.0, + "step": 300 + }, + { + "entropy": 0.737273331284523, + "epoch": 0.9090909090909091, + "grad_norm": 0.4956009089946747, + "learning_rate": 9.950467593057406e-05, + "loss": 0.6904002380371094, + "mean_token_accuracy": 0.8083628410100937, + "num_tokens": 894871.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.7681069333965962, + "eval_loss": 0.7430074214935303, + "eval_mean_token_accuracy": 0.7941452006881053, + "eval_num_tokens": 981550.0, + "eval_runtime": 47.9814, + "eval_samples_per_second": 34.534, + "eval_steps_per_second": 4.335, + "step": 385 + }, + { + "entropy": 0.7167584246397019, + "epoch": 1.0389610389610389, + "grad_norm": 0.638637900352478, + "learning_rate": 0.00010976434715123926, + "loss": 0.6611510467529297, + "mean_token_accuracy": 0.8145374125242233, + "num_tokens": 1021041.0, + "step": 400 + }, + { + "entropy": 0.6893188625574111, + "epoch": 1.1688311688311688, + "grad_norm": 0.554688036441803, + "learning_rate": 0.00010967639449071182, + "loss": 0.6347718811035157, + "mean_token_accuracy": 0.8226255452632905, + "num_tokens": 1149161.0, + "step": 450 + }, + { + "entropy": 0.6728485250473022, + "epoch": 1.2987012987012987, + "grad_norm": 0.5104691386222839, + "learning_rate": 0.00010947585797076785, + "loss": 0.6233362579345703, + "mean_token_accuracy": 0.8227335858345032, + "num_tokens": 1278079.0, + "step": 500 + }, + { + "entropy": 0.6812490409612656, + "epoch": 1.4285714285714286, + "grad_norm": 0.4700304865837097, + "learning_rate": 0.00010916314964373551, + "loss": 0.6289351654052734, + "mean_token_accuracy": 0.8217820060253144, + "num_tokens": 1405326.0, + "step": 550 + }, + { + "entropy": 0.6633523851633072, + "epoch": 1.5584415584415585, + "grad_norm": 0.5094137191772461, + "learning_rate": 0.0001087389120469154, + "loss": 0.6168266296386719, + "mean_token_accuracy": 0.8232935756444931, + "num_tokens": 1532507.0, + "step": 600 + }, + { + "entropy": 0.6454597359895706, + "epoch": 1.6883116883116882, + "grad_norm": 0.40546780824661255, + "learning_rate": 0.00010820401688232725, + "loss": 0.5995024108886718, + "mean_token_accuracy": 0.8284876370429992, + "num_tokens": 1662747.0, + "step": 650 + }, + { + "entropy": 0.6700882267951965, + "epoch": 1.8181818181818183, + "grad_norm": 0.35203394293785095, + "learning_rate": 0.00010755956322558065, + "loss": 0.616350212097168, + "mean_token_accuracy": 0.8246171402931214, + "num_tokens": 1783817.0, + "step": 700 + }, + { + "entropy": 0.6587968200445176, + "epoch": 1.948051948051948, + "grad_norm": 0.5248188376426697, + "learning_rate": 0.00010680687526754984, + "loss": 0.608861198425293, + "mean_token_accuracy": 0.8265628081560135, + "num_tokens": 1914573.0, + "step": 750 + }, + { + "epoch": 2.0, + "eval_entropy": 0.7101548368541094, + "eval_loss": 0.6869887113571167, + "eval_mean_token_accuracy": 0.8058141409777678, + "eval_num_tokens": 1963100.0, + "eval_runtime": 48.0161, + "eval_samples_per_second": 34.509, + "eval_steps_per_second": 4.332, + "step": 770 + }, + { + "entropy": 0.6272834652662277, + "epoch": 2.0779220779220777, + "grad_norm": 0.5319918990135193, + "learning_rate": 0.00010594749959349313, + "loss": 0.5719264221191406, + "mean_token_accuracy": 0.8345229256153107, + "num_tokens": 2035968.0, + "step": 800 + }, + { + "entropy": 0.599158108830452, + "epoch": 2.207792207792208, + "grad_norm": 0.4296111762523651, + "learning_rate": 0.00010498320200520744, + "loss": 0.5460208129882812, + "mean_token_accuracy": 0.8396762716770172, + "num_tokens": 2168591.0, + "step": 850 + }, + { + "entropy": 0.5994478952884674, + "epoch": 2.3376623376623376, + "grad_norm": 0.3820176422595978, + "learning_rate": 0.00010391596389274791, + "loss": 0.5483282852172852, + "mean_token_accuracy": 0.8385387778282165, + "num_tokens": 2297967.0, + "step": 900 + }, + { + "entropy": 0.5989043036103249, + "epoch": 2.4675324675324677, + "grad_norm": 0.41718003153800964, + "learning_rate": 0.00010274797816316749, + "loss": 0.543673095703125, + "mean_token_accuracy": 0.8396250855922699, + "num_tokens": 2425614.0, + "step": 950 + }, + { + "entropy": 0.6005555561184883, + "epoch": 2.5974025974025974, + "grad_norm": 0.4552708864212036, + "learning_rate": 0.00010148164473464206, + "loss": 0.5505282974243164, + "mean_token_accuracy": 0.8371219438314438, + "num_tokens": 2557421.0, + "step": 1000 + }, + { + "entropy": 0.5972397547960281, + "epoch": 2.7272727272727275, + "grad_norm": 0.45796939730644226, + "learning_rate": 0.00010011956560523972, + "loss": 0.5410661697387695, + "mean_token_accuracy": 0.8398081564903259, + "num_tokens": 2680379.0, + "step": 1050 + }, + { + "entropy": 0.5875487339496612, + "epoch": 2.857142857142857, + "grad_norm": 0.37569308280944824, + "learning_rate": 9.866453950646624e-05, + "loss": 0.537100830078125, + "mean_token_accuracy": 0.8407874500751495, + "num_tokens": 2810854.0, + "step": 1100 + }, + { + "entropy": 0.6040622130036354, + "epoch": 2.987012987012987, + "grad_norm": 0.4184563457965851, + "learning_rate": 9.711955615257278e-05, + "loss": 0.5466165924072266, + "mean_token_accuracy": 0.8411829793453216, + "num_tokens": 2932637.0, + "step": 1150 + }, + { + "epoch": 3.0, + "eval_entropy": 0.6296488522337034, + "eval_loss": 0.671941876411438, + "eval_mean_token_accuracy": 0.8121863231062889, + "eval_num_tokens": 2944650.0, + "eval_runtime": 47.9946, + "eval_samples_per_second": 34.525, + "eval_steps_per_second": 4.334, + "step": 1155 + }, + { + "entropy": 0.543001911342144, + "epoch": 3.116883116883117, + "grad_norm": 0.39842459559440613, + "learning_rate": 9.548779009744178e-05, + "loss": 0.4850382995605469, + "mean_token_accuracy": 0.8531740349531174, + "num_tokens": 3054046.0, + "step": 1200 + }, + { + "entropy": 0.5417884975671768, + "epoch": 3.2467532467532467, + "grad_norm": 0.529087245464325, + "learning_rate": 9.37725942116738e-05, + "loss": 0.48398651123046876, + "mean_token_accuracy": 0.8532969230413436, + "num_tokens": 3182515.0, + "step": 1250 + }, + { + "entropy": 0.5348580208420753, + "epoch": 3.3766233766233764, + "grad_norm": 0.4483303129673004, + "learning_rate": 9.197749279327802e-05, + "loss": 0.4842509078979492, + "mean_token_accuracy": 0.8533288407325744, + "num_tokens": 3313672.0, + "step": 1300 + }, + { + "entropy": 0.5422027057409287, + "epoch": 3.5064935064935066, + "grad_norm": 0.448234885931015, + "learning_rate": 9.010617432612243e-05, + "loss": 0.48615737915039064, + "mean_token_accuracy": 0.8517620968818664, + "num_tokens": 3440689.0, + "step": 1350 + }, + { + "entropy": 0.5518654137849808, + "epoch": 3.6363636363636362, + "grad_norm": 0.5707411170005798, + "learning_rate": 8.816248390102322e-05, + "loss": 0.4946014404296875, + "mean_token_accuracy": 0.8510974669456481, + "num_tokens": 3566427.0, + "step": 1400 + }, + { + "entropy": 0.5268254142999649, + "epoch": 3.7662337662337664, + "grad_norm": 0.42708849906921387, + "learning_rate": 8.615041531504609e-05, + "loss": 0.474882926940918, + "mean_token_accuracy": 0.8555294382572174, + "num_tokens": 3699976.0, + "step": 1450 + }, + { + "entropy": 0.5430487287044525, + "epoch": 3.896103896103896, + "grad_norm": 0.5479393005371094, + "learning_rate": 8.407410286525337e-05, + "loss": 0.48806171417236327, + "mean_token_accuracy": 0.8523763221502304, + "num_tokens": 3827926.0, + "step": 1500 + }, + { + "epoch": 4.0, + "eval_entropy": 0.6002918778417202, + "eval_loss": 0.6801126003265381, + "eval_mean_token_accuracy": 0.8130095520844827, + "eval_num_tokens": 3926200.0, + "eval_runtime": 47.9844, + "eval_samples_per_second": 34.532, + "eval_steps_per_second": 4.335, + "step": 1540 + }, + { + "entropy": 0.5307527387142181, + "epoch": 4.025974025974026, + "grad_norm": 0.5444459915161133, + "learning_rate": 8.193781285375899e-05, + "loss": 0.4741718292236328, + "mean_token_accuracy": 0.8564361107349395, + "num_tokens": 3953109.0, + "step": 1550 + }, + { + "entropy": 0.47564762085676193, + "epoch": 4.1558441558441555, + "grad_norm": 0.6210038065910339, + "learning_rate": 7.974593482154601e-05, + "loss": 0.41448020935058594, + "mean_token_accuracy": 0.8706729990243912, + "num_tokens": 4081079.0, + "step": 1600 + }, + { + "entropy": 0.4731892004609108, + "epoch": 4.285714285714286, + "grad_norm": 0.4910389482975006, + "learning_rate": 7.750297252905916e-05, + "loss": 0.4135689163208008, + "mean_token_accuracy": 0.8708927237987518, + "num_tokens": 4206426.0, + "step": 1650 + }, + { + "entropy": 0.47941224902868274, + "epoch": 4.415584415584416, + "grad_norm": 0.4900703430175781, + "learning_rate": 7.521353470210501e-05, + "loss": 0.4219230270385742, + "mean_token_accuracy": 0.8685608941316605, + "num_tokens": 4329839.0, + "step": 1700 + }, + { + "entropy": 0.48145264506340024, + "epoch": 4.545454545454545, + "grad_norm": 0.4503515064716339, + "learning_rate": 7.288232556207461e-05, + "loss": 0.4248290252685547, + "mean_token_accuracy": 0.8680109107494354, + "num_tokens": 4453940.0, + "step": 1750 + }, + { + "entropy": 0.47785325974226, + "epoch": 4.675324675324675, + "grad_norm": 0.5630834698677063, + "learning_rate": 7.051413515994661e-05, + "loss": 0.4244534683227539, + "mean_token_accuracy": 0.8692984575033188, + "num_tokens": 4583626.0, + "step": 1800 + }, + { + "entropy": 0.4738149631023407, + "epoch": 4.805194805194805, + "grad_norm": 0.5809922814369202, + "learning_rate": 6.811382953393207e-05, + "loss": 0.41768589019775393, + "mean_token_accuracy": 0.870254020690918, + "num_tokens": 4713540.0, + "step": 1850 + }, + { + "entropy": 0.47820780247449873, + "epoch": 4.935064935064935, + "grad_norm": 0.5658873915672302, + "learning_rate": 6.56863407109845e-05, + "loss": 0.42182437896728514, + "mean_token_accuracy": 0.8680490332841874, + "num_tokens": 4845678.0, + "step": 1900 + }, + { + "epoch": 5.0, + "eval_entropy": 0.5540635969776374, + "eval_loss": 0.6924836039543152, + "eval_mean_token_accuracy": 0.8140181821699326, + "eval_num_tokens": 4907750.0, + "eval_runtime": 47.9926, + "eval_samples_per_second": 34.526, + "eval_steps_per_second": 4.334, + "step": 1925 + }, + { + "entropy": 0.4371735429763794, + "epoch": 5.064935064935065, + "grad_norm": 0.5684086680412292, + "learning_rate": 6.323665657271966e-05, + "loss": 0.3749085998535156, + "mean_token_accuracy": 0.8815305006504058, + "num_tokens": 4975955.0, + "step": 1950 + }, + { + "entropy": 0.40612608641386033, + "epoch": 5.194805194805195, + "grad_norm": 0.6515666842460632, + "learning_rate": 6.076981060656787e-05, + "loss": 0.33952392578125, + "mean_token_accuracy": 0.8902835595607758, + "num_tokens": 5104770.0, + "step": 2000 + }, + { + "entropy": 0.41485957682132724, + "epoch": 5.324675324675325, + "grad_norm": 0.5700598359107971, + "learning_rate": 5.829087156321799e-05, + "loss": 0.345616455078125, + "mean_token_accuracy": 0.8897144883871079, + "num_tokens": 5229052.0, + "step": 2050 + }, + { + "entropy": 0.4269444864988327, + "epoch": 5.454545454545454, + "grad_norm": 0.6670826077461243, + "learning_rate": 5.580493304160404e-05, + "loss": 0.35833843231201173, + "mean_token_accuracy": 0.8866234600543976, + "num_tokens": 5357753.0, + "step": 2100 + }, + { + "entropy": 0.41144982814788816, + "epoch": 5.584415584415584, + "grad_norm": 0.620884120464325, + "learning_rate": 5.331710302283492e-05, + "loss": 0.3445538330078125, + "mean_token_accuracy": 0.8895936322212219, + "num_tokens": 5481550.0, + "step": 2150 + }, + { + "entropy": 0.41389220267534255, + "epoch": 5.714285714285714, + "grad_norm": 0.6487388610839844, + "learning_rate": 5.0832493374572605e-05, + "loss": 0.34858001708984376, + "mean_token_accuracy": 0.8874113804101944, + "num_tokens": 5609104.0, + "step": 2200 + }, + { + "entropy": 0.41483602195978164, + "epoch": 5.8441558441558445, + "grad_norm": 0.5946773290634155, + "learning_rate": 4.835620934742408e-05, + "loss": 0.3495229721069336, + "mean_token_accuracy": 0.8887655180692673, + "num_tokens": 5737347.0, + "step": 2250 + }, + { + "entropy": 0.4204313641786575, + "epoch": 5.974025974025974, + "grad_norm": 0.6658430099487305, + "learning_rate": 4.589333908492996e-05, + "loss": 0.3538378143310547, + "mean_token_accuracy": 0.8866806083917618, + "num_tokens": 5861872.0, + "step": 2300 + }, + { + "epoch": 6.0, + "eval_entropy": 0.4912281467650945, + "eval_loss": 0.7537463903427124, + "eval_mean_token_accuracy": 0.8104288898981534, + "eval_num_tokens": 5889300.0, + "eval_runtime": 48.0083, + "eval_samples_per_second": 34.515, + "eval_steps_per_second": 4.333, + "step": 2310 + }, + { + "entropy": 0.3666571286320686, + "epoch": 6.103896103896104, + "grad_norm": 0.6230902671813965, + "learning_rate": 4.344894316870371e-05, + "loss": 0.2813127517700195, + "mean_token_accuracy": 0.9077165073156357, + "num_tokens": 5990679.0, + "step": 2350 + }, + { + "entropy": 0.3391442158818245, + "epoch": 6.233766233766234, + "grad_norm": 0.629700779914856, + "learning_rate": 4.1028044220203685e-05, + "loss": 0.26457656860351564, + "mean_token_accuracy": 0.9139899307489395, + "num_tokens": 6117918.0, + "step": 2400 + }, + { + "entropy": 0.34152675241231917, + "epoch": 6.363636363636363, + "grad_norm": 0.711188793182373, + "learning_rate": 3.863561658050396e-05, + "loss": 0.26950265884399416, + "mean_token_accuracy": 0.9120226174592971, + "num_tokens": 6248566.0, + "step": 2450 + }, + { + "entropy": 0.33951902255415917, + "epoch": 6.4935064935064934, + "grad_norm": 0.7751043438911438, + "learning_rate": 3.627657608926905e-05, + "loss": 0.26502132415771484, + "mean_token_accuracy": 0.9131791013479232, + "num_tokens": 6380529.0, + "step": 2500 + }, + { + "entropy": 0.3548544436693192, + "epoch": 6.623376623376624, + "grad_norm": 0.9152925610542297, + "learning_rate": 3.395576998393457e-05, + "loss": 0.27833885192871094, + "mean_token_accuracy": 0.9090453034639359, + "num_tokens": 6501669.0, + "step": 2550 + }, + { + "entropy": 0.3550854653120041, + "epoch": 6.753246753246753, + "grad_norm": 0.8495270013809204, + "learning_rate": 3.167796693984804e-05, + "loss": 0.27818309783935546, + "mean_token_accuracy": 0.9102006632089615, + "num_tokens": 6624404.0, + "step": 2600 + }, + { + "entropy": 0.34031844735145567, + "epoch": 6.883116883116883, + "grad_norm": 0.6764019727706909, + "learning_rate": 2.9447847271835456e-05, + "loss": 0.26656494140625, + "mean_token_accuracy": 0.9126953399181366, + "num_tokens": 6755532.0, + "step": 2650 + }, + { + "epoch": 7.0, + "eval_entropy": 0.4410387526911039, + "eval_loss": 0.8089802265167236, + "eval_mean_token_accuracy": 0.80925900173875, + "eval_num_tokens": 6870850.0, + "eval_runtime": 47.9562, + "eval_samples_per_second": 34.552, + "eval_steps_per_second": 4.337, + "step": 2695 + }, + { + "entropy": 0.335344740152359, + "epoch": 7.012987012987013, + "grad_norm": 0.6364794373512268, + "learning_rate": 2.7269993317326242e-05, + "loss": 0.25932022094726564, + "mean_token_accuracy": 0.9156477284431458, + "num_tokens": 6884338.0, + "step": 2700 + }, + { + "entropy": 0.283698299229145, + "epoch": 7.142857142857143, + "grad_norm": 0.9388208389282227, + "learning_rate": 2.514888002079755e-05, + "loss": 0.19749004364013673, + "mean_token_accuracy": 0.9353394263982773, + "num_tokens": 7008981.0, + "step": 2750 + }, + { + "entropy": 0.27516734033823015, + "epoch": 7.2727272727272725, + "grad_norm": 0.706503689289093, + "learning_rate": 2.3088865738883814e-05, + "loss": 0.19110334396362305, + "mean_token_accuracy": 0.9379108762741089, + "num_tokens": 7135402.0, + "step": 2800 + }, + { + "entropy": 0.273246209025383, + "epoch": 7.402597402597403, + "grad_norm": 0.7857301235198975, + "learning_rate": 2.1094183285045552e-05, + "loss": 0.19094297409057617, + "mean_token_accuracy": 0.9369927847385406, + "num_tokens": 7265920.0, + "step": 2850 + }, + { + "entropy": 0.2801500430703163, + "epoch": 7.532467532467533, + "grad_norm": 0.7793248891830444, + "learning_rate": 1.9168931232197576e-05, + "loss": 0.19656993865966796, + "mean_token_accuracy": 0.9351688891649246, + "num_tokens": 7389633.0, + "step": 2900 + }, + { + "entropy": 0.27495736733078957, + "epoch": 7.662337662337662, + "grad_norm": 0.8637392520904541, + "learning_rate": 1.7317065491168085e-05, + "loss": 0.1936025810241699, + "mean_token_accuracy": 0.9363743001222611, + "num_tokens": 7518696.0, + "step": 2950 + }, + { + "entropy": 0.2810053497552872, + "epoch": 7.792207792207792, + "grad_norm": 0.7629940509796143, + "learning_rate": 1.554239118229261e-05, + "loss": 0.1976767921447754, + "mean_token_accuracy": 0.9348012053966522, + "num_tokens": 7643525.0, + "step": 3000 + }, + { + "entropy": 0.272479218095541, + "epoch": 7.922077922077922, + "grad_norm": 0.8325297832489014, + "learning_rate": 1.3848554816844692e-05, + "loss": 0.1889443016052246, + "mean_token_accuracy": 0.9384464406967163, + "num_tokens": 7773494.0, + "step": 3050 + }, + { + "epoch": 8.0, + "eval_entropy": 0.4007443663879083, + "eval_loss": 0.8944177031517029, + "eval_mean_token_accuracy": 0.8053252046497968, + "eval_num_tokens": 7852400.0, + "eval_runtime": 47.9727, + "eval_samples_per_second": 34.54, + "eval_steps_per_second": 4.336, + "step": 3080 + }, + { + "entropy": 0.2529501300305128, + "epoch": 8.051948051948052, + "grad_norm": 0.7490427494049072, + "learning_rate": 1.2239036804368287e-05, + "loss": 0.1695658302307129, + "mean_token_accuracy": 0.9450542360544205, + "num_tokens": 7903348.0, + "step": 3100 + }, + { + "entropy": 0.2297460925579071, + "epoch": 8.181818181818182, + "grad_norm": 0.7783088088035583, + "learning_rate": 1.0717144301307847e-05, + "loss": 0.13803850173950194, + "mean_token_accuracy": 0.9550822985172271, + "num_tokens": 8028778.0, + "step": 3150 + }, + { + "entropy": 0.230171779692173, + "epoch": 8.311688311688311, + "grad_norm": 0.8110005259513855, + "learning_rate": 9.286004415629994e-06, + "loss": 0.1422537612915039, + "mean_token_accuracy": 0.9539634013175964, + "num_tokens": 8153699.0, + "step": 3200 + }, + { + "entropy": 0.22410715252161026, + "epoch": 8.441558441558442, + "grad_norm": 0.6850584745407104, + "learning_rate": 7.948557781399818e-06, + "loss": 0.13708532333374024, + "mean_token_accuracy": 0.9560226953029632, + "num_tokens": 8281408.0, + "step": 3250 + }, + { + "entropy": 0.23170628443360328, + "epoch": 8.571428571428571, + "grad_norm": 0.7979664206504822, + "learning_rate": 6.707552516514227e-06, + "loss": 0.14379706382751464, + "mean_token_accuracy": 0.9544130796194077, + "num_tokens": 8400657.0, + "step": 3300 + }, + { + "entropy": 0.2197262801229954, + "epoch": 8.7012987012987, + "grad_norm": 0.6074482202529907, + "learning_rate": 5.565538576007922e-06, + "loss": 0.13595272064208985, + "mean_token_accuracy": 0.9567223310470581, + "num_tokens": 8533080.0, + "step": 3350 + }, + { + "entropy": 0.2189832380414009, + "epoch": 8.831168831168831, + "grad_norm": 0.7105876207351685, + "learning_rate": 4.5248625125343745e-06, + "loss": 0.13385194778442383, + "mean_token_accuracy": 0.9567512100934983, + "num_tokens": 8664922.0, + "step": 3400 + }, + { + "entropy": 0.2234116178750992, + "epoch": 8.96103896103896, + "grad_norm": 0.7933406829833984, + "learning_rate": 3.587662654787801e-06, + "loss": 0.13944730758666993, + "mean_token_accuracy": 0.9551076376438141, + "num_tokens": 8797714.0, + "step": 3450 + }, + { + "epoch": 9.0, + "eval_entropy": 0.36579762743069577, + "eval_loss": 0.9979091286659241, + "eval_mean_token_accuracy": 0.8024594216392591, + "eval_num_tokens": 8833950.0, + "eval_runtime": 47.9914, + "eval_samples_per_second": 34.527, + "eval_steps_per_second": 4.334, + "step": 3465 + }, + { + "entropy": 0.21086626052856444, + "epoch": 9.090909090909092, + "grad_norm": 0.7791101336479187, + "learning_rate": 2.7558647137731255e-06, + "loss": 0.122357816696167, + "mean_token_accuracy": 0.962859439253807, + "num_tokens": 8925069.0, + "step": 3500 + }, + { + "entropy": 0.19901559188961981, + "epoch": 9.220779220779221, + "grad_norm": 0.6296378970146179, + "learning_rate": 2.0311778259521985e-06, + "loss": 0.11158108711242676, + "mean_token_accuracy": 0.9652415263652802, + "num_tokens": 9056953.0, + "step": 3550 + }, + { + "entropy": 0.2004897651076317, + "epoch": 9.35064935064935, + "grad_norm": 0.6992365121841431, + "learning_rate": 1.4150910413963161e-06, + "loss": 0.1120915412902832, + "mean_token_accuracy": 0.9652277189493179, + "num_tokens": 9186990.0, + "step": 3600 + }, + { + "entropy": 0.20551898300647736, + "epoch": 9.480519480519481, + "grad_norm": 0.6944624781608582, + "learning_rate": 9.088702641613061e-07, + "loss": 0.11612151145935058, + "mean_token_accuracy": 0.9643786966800689, + "num_tokens": 9311897.0, + "step": 3650 + }, + { + "entropy": 0.20479844331741334, + "epoch": 9.61038961038961, + "grad_norm": 0.7582993507385254, + "learning_rate": 5.135556511716324e-07, + "loss": 0.11483741760253906, + "mean_token_accuracy": 0.9642562127113342, + "num_tokens": 9434396.0, + "step": 3700 + }, + { + "entropy": 0.20504073575139045, + "epoch": 9.74025974025974, + "grad_norm": 0.7992972135543823, + "learning_rate": 2.299594749584497e-07, + "loss": 0.11569642066955567, + "mean_token_accuracy": 0.9646343672275544, + "num_tokens": 9560766.0, + "step": 3750 + }, + { + "entropy": 0.20401015728712082, + "epoch": 9.87012987012987, + "grad_norm": 0.8295965790748596, + "learning_rate": 5.866445464296065e-08, + "loss": 0.11431631088256836, + "mean_token_accuracy": 0.9642905777692795, + "num_tokens": 9686573.0, + "step": 3800 + }, + { + "entropy": 0.2005618315190077, + "epoch": 10.0, + "grad_norm": 0.9638449549674988, + "learning_rate": 2.255859454874737e-11, + "loss": 0.11225462913513183, + "mean_token_accuracy": 0.9657586789131165, + "num_tokens": 9815500.0, + "step": 3850 + }, + { + "epoch": 10.0, + "eval_entropy": 0.3506911682824676, + "eval_loss": 1.0614067316055298, + "eval_mean_token_accuracy": 0.8005969426952876, + "eval_num_tokens": 9815500.0, + "eval_runtime": 47.9746, + "eval_samples_per_second": 34.539, + "eval_steps_per_second": 4.336, + "step": 3850 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.436930629170852e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..34b529d53848971eda3155b95c185289489ac0b8 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.04641649878824187, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj", + "gate_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9a50918f7a56462b63b56fd132799a700284faa1 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/trainer_state.json @@ -0,0 +1,206 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 770, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.0450534927845, + "epoch": 0.12987012987012986, + "grad_norm": 0.9224470853805542, + "learning_rate": 1.3970570546126444e-05, + "loss": 1.944740753173828, + "mean_token_accuracy": 0.6158743992447853, + "num_tokens": 133878.0, + "step": 50 + }, + { + "entropy": 1.1079417771101, + "epoch": 0.2597402597402597, + "grad_norm": 0.6827074289321899, + "learning_rate": 2.822625477686771e-05, + "loss": 1.0624147033691407, + "mean_token_accuracy": 0.7396554726362229, + "num_tokens": 265533.0, + "step": 100 + }, + { + "entropy": 0.8689985585212707, + "epoch": 0.38961038961038963, + "grad_norm": 0.6320164203643799, + "learning_rate": 4.248193900760899e-05, + "loss": 0.8256858825683594, + "mean_token_accuracy": 0.7825630265474319, + "num_tokens": 386480.0, + "step": 150 + }, + { + "entropy": 0.7946180325746536, + "epoch": 0.5194805194805194, + "grad_norm": 0.7246975302696228, + "learning_rate": 5.6737623238350247e-05, + "loss": 0.7475, + "mean_token_accuracy": 0.7959125518798829, + "num_tokens": 520135.0, + "step": 200 + }, + { + "entropy": 0.7919283157587051, + "epoch": 0.6493506493506493, + "grad_norm": 0.7635198831558228, + "learning_rate": 7.099330746909153e-05, + "loss": 0.737699966430664, + "mean_token_accuracy": 0.798919832110405, + "num_tokens": 640303.0, + "step": 250 + }, + { + "entropy": 0.7713775283098221, + "epoch": 0.7792207792207793, + "grad_norm": 0.5751814842224121, + "learning_rate": 8.52489916998328e-05, + "loss": 0.7186790466308594, + "mean_token_accuracy": 0.8014196193218232, + "num_tokens": 766027.0, + "step": 300 + }, + { + "entropy": 0.737273331284523, + "epoch": 0.9090909090909091, + "grad_norm": 0.4956009089946747, + "learning_rate": 9.950467593057406e-05, + "loss": 0.6904002380371094, + "mean_token_accuracy": 0.8083628410100937, + "num_tokens": 894871.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.7681069333965962, + "eval_loss": 0.7430074214935303, + "eval_mean_token_accuracy": 0.7941452006881053, + "eval_num_tokens": 981550.0, + "eval_runtime": 47.9814, + "eval_samples_per_second": 34.534, + "eval_steps_per_second": 4.335, + "step": 385 + }, + { + "entropy": 0.7167584246397019, + "epoch": 1.0389610389610389, + "grad_norm": 0.638637900352478, + "learning_rate": 0.00010976434715123926, + "loss": 0.6611510467529297, + "mean_token_accuracy": 0.8145374125242233, + "num_tokens": 1021041.0, + "step": 400 + }, + { + "entropy": 0.6893188625574111, + "epoch": 1.1688311688311688, + "grad_norm": 0.554688036441803, + "learning_rate": 0.00010967639449071182, + "loss": 0.6347718811035157, + "mean_token_accuracy": 0.8226255452632905, + "num_tokens": 1149161.0, + "step": 450 + }, + { + "entropy": 0.6728485250473022, + "epoch": 1.2987012987012987, + "grad_norm": 0.5104691386222839, + "learning_rate": 0.00010947585797076785, + "loss": 0.6233362579345703, + "mean_token_accuracy": 0.8227335858345032, + "num_tokens": 1278079.0, + "step": 500 + }, + { + "entropy": 0.6812490409612656, + "epoch": 1.4285714285714286, + "grad_norm": 0.4700304865837097, + "learning_rate": 0.00010916314964373551, + "loss": 0.6289351654052734, + "mean_token_accuracy": 0.8217820060253144, + "num_tokens": 1405326.0, + "step": 550 + }, + { + "entropy": 0.6633523851633072, + "epoch": 1.5584415584415585, + "grad_norm": 0.5094137191772461, + "learning_rate": 0.0001087389120469154, + "loss": 0.6168266296386719, + "mean_token_accuracy": 0.8232935756444931, + "num_tokens": 1532507.0, + "step": 600 + }, + { + "entropy": 0.6454597359895706, + "epoch": 1.6883116883116882, + "grad_norm": 0.40546780824661255, + "learning_rate": 0.00010820401688232725, + "loss": 0.5995024108886718, + "mean_token_accuracy": 0.8284876370429992, + "num_tokens": 1662747.0, + "step": 650 + }, + { + "entropy": 0.6700882267951965, + "epoch": 1.8181818181818183, + "grad_norm": 0.35203394293785095, + "learning_rate": 0.00010755956322558065, + "loss": 0.616350212097168, + "mean_token_accuracy": 0.8246171402931214, + "num_tokens": 1783817.0, + "step": 700 + }, + { + "entropy": 0.6587968200445176, + "epoch": 1.948051948051948, + "grad_norm": 0.5248188376426697, + "learning_rate": 0.00010680687526754984, + "loss": 0.608861198425293, + "mean_token_accuracy": 0.8265628081560135, + "num_tokens": 1914573.0, + "step": 750 + }, + { + "epoch": 2.0, + "eval_entropy": 0.7101548368541094, + "eval_loss": 0.6869887113571167, + "eval_mean_token_accuracy": 0.8058141409777678, + "eval_num_tokens": 1963100.0, + "eval_runtime": 48.0161, + "eval_samples_per_second": 34.509, + "eval_steps_per_second": 4.332, + "step": 770 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0934953976464589e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1e1c4b96e90acaf89742a04a298690de735bcc46 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/README.md @@ -0,0 +1,58 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: transformers +model_name: Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2 +tags: +- generated_from_trainer +- trl +- sft +licence: license +--- + +# Model Card for Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2 + +This model is a fine-tuned version of [Qwen/Qwen3-4B-Base](https://huggingface.co/Qwen/Qwen3-4B-Base). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/katriin-kukk/Cross_lingual_morphological_generalization/runs/6okpbvfx) + + + +This model was trained with SFT. + +### Framework versions + +- TRL: 0.29.0 +- Transformers: 5.5.4 +- Pytorch: 2.10.0 +- Datasets: 4.6.1 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e4d42893a69e354223672a3b0f9de6f2b979c004 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05284766149829996, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj", + "gate_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..01b67b69b00a195e00981eca9a4433d8d03e122d --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/trainer_state.json @@ -0,0 +1,297 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1155, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.8529596650600433, + "epoch": 0.12987012987012986, + "grad_norm": 0.878183901309967, + "learning_rate": 2.8227544379109735e-05, + "loss": 1.762685546875, + "mean_token_accuracy": 0.6396001759171486, + "num_tokens": 133878.0, + "step": 50 + }, + { + "entropy": 0.9309259909391403, + "epoch": 0.2597402597402597, + "grad_norm": 0.6279756426811218, + "learning_rate": 5.7031161092487016e-05, + "loss": 0.8922532653808594, + "mean_token_accuracy": 0.768746777176857, + "num_tokens": 265533.0, + "step": 100 + }, + { + "entropy": 0.8268384379148483, + "epoch": 0.38961038961038963, + "grad_norm": 0.5478948354721069, + "learning_rate": 8.583477780586432e-05, + "loss": 0.7756452178955078, + "mean_token_accuracy": 0.7925205600261688, + "num_tokens": 386480.0, + "step": 150 + }, + { + "entropy": 0.7658095824718475, + "epoch": 0.5194805194805194, + "grad_norm": 0.5972853899002075, + "learning_rate": 0.00011463839451924158, + "loss": 0.7161135101318359, + "mean_token_accuracy": 0.8019153982400894, + "num_tokens": 520135.0, + "step": 200 + }, + { + "entropy": 0.769344270825386, + "epoch": 0.6493506493506493, + "grad_norm": 0.5576531887054443, + "learning_rate": 0.0001434420112326189, + "loss": 0.716611557006836, + "mean_token_accuracy": 0.8028716742992401, + "num_tokens": 640303.0, + "step": 250 + }, + { + "entropy": 0.7470700722932816, + "epoch": 0.7792207792207793, + "grad_norm": 0.3937978148460388, + "learning_rate": 0.0001722456279459962, + "loss": 0.6983388519287109, + "mean_token_accuracy": 0.80596988260746, + "num_tokens": 766027.0, + "step": 300 + }, + { + "entropy": 0.7172233510017395, + "epoch": 0.9090909090909091, + "grad_norm": 0.32564300298690796, + "learning_rate": 0.00020104924465937345, + "loss": 0.6731016540527344, + "mean_token_accuracy": 0.8119878542423248, + "num_tokens": 894871.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.7397788639825124, + "eval_loss": 0.7304292917251587, + "eval_mean_token_accuracy": 0.7976073359067624, + "eval_num_tokens": 981550.0, + "eval_runtime": 46.5688, + "eval_samples_per_second": 35.582, + "eval_steps_per_second": 4.467, + "step": 385 + }, + { + "entropy": 0.7035581320524216, + "epoch": 1.0389610389610389, + "grad_norm": 0.45747122168540955, + "learning_rate": 0.00022177891520076015, + "loss": 0.6450813293457032, + "mean_token_accuracy": 0.8178210550546646, + "num_tokens": 1021041.0, + "step": 400 + }, + { + "entropy": 0.6679977881908417, + "epoch": 1.1688311688311688, + "grad_norm": 0.4456019401550293, + "learning_rate": 0.0002216012068086725, + "loss": 0.6189227294921875, + "mean_token_accuracy": 0.8257622331380844, + "num_tokens": 1149161.0, + "step": 450 + }, + { + "entropy": 0.6574469250440598, + "epoch": 1.2987012987012987, + "grad_norm": 0.3784359097480774, + "learning_rate": 0.00022119602267552194, + "loss": 0.6098381805419922, + "mean_token_accuracy": 0.8251238936185836, + "num_tokens": 1278079.0, + "step": 500 + }, + { + "entropy": 0.6593573099374771, + "epoch": 1.4285714285714286, + "grad_norm": 0.36508747935295105, + "learning_rate": 0.00022056419535323196, + "loss": 0.6137563705444335, + "mean_token_accuracy": 0.824503253698349, + "num_tokens": 1405326.0, + "step": 550 + }, + { + "entropy": 0.6459340593218803, + "epoch": 1.5584415584415585, + "grad_norm": 0.39773380756378174, + "learning_rate": 0.00021970702308872148, + "loss": 0.5986767196655274, + "mean_token_accuracy": 0.8270765203237533, + "num_tokens": 1532507.0, + "step": 600 + }, + { + "entropy": 0.633097175359726, + "epoch": 1.6883116883116882, + "grad_norm": 0.41731834411621094, + "learning_rate": 0.00021862626715633265, + "loss": 0.5849922180175782, + "mean_token_accuracy": 0.8313153618574143, + "num_tokens": 1662747.0, + "step": 650 + }, + { + "entropy": 0.6569280475378036, + "epoch": 1.8181818181818183, + "grad_norm": 0.3067665100097656, + "learning_rate": 0.00021732414823885307, + "loss": 0.605062370300293, + "mean_token_accuracy": 0.8269566106796264, + "num_tokens": 1783817.0, + "step": 700 + }, + { + "entropy": 0.6459099870920181, + "epoch": 1.948051948051948, + "grad_norm": 0.5508913397789001, + "learning_rate": 0.00021580334186456886, + "loss": 0.596672134399414, + "mean_token_accuracy": 0.8287447422742844, + "num_tokens": 1914573.0, + "step": 750 + }, + { + "epoch": 2.0, + "eval_entropy": 0.6985598478752834, + "eval_loss": 0.6716415286064148, + "eval_mean_token_accuracy": 0.808078097895934, + "eval_num_tokens": 1963100.0, + "eval_runtime": 46.5197, + "eval_samples_per_second": 35.619, + "eval_steps_per_second": 4.471, + "step": 770 + }, + { + "entropy": 0.6113292586803436, + "epoch": 2.0779220779220777, + "grad_norm": 0.37614893913269043, + "learning_rate": 0.00021406697290972404, + "loss": 0.5562425231933594, + "mean_token_accuracy": 0.8369060623645782, + "num_tokens": 2035968.0, + "step": 800 + }, + { + "entropy": 0.5720089420676231, + "epoch": 2.207792207792208, + "grad_norm": 0.35560500621795654, + "learning_rate": 0.00021211860917768236, + "loss": 0.521314697265625, + "mean_token_accuracy": 0.8453685063123703, + "num_tokens": 2168591.0, + "step": 850 + }, + { + "entropy": 0.5757493850588798, + "epoch": 2.3376623376623376, + "grad_norm": 0.304592490196228, + "learning_rate": 0.00020996225406798486, + "loss": 0.5258681106567383, + "mean_token_accuracy": 0.8431038129329681, + "num_tokens": 2297967.0, + "step": 900 + }, + { + "entropy": 0.5790414094924927, + "epoch": 2.4675324675324677, + "grad_norm": 0.29762616753578186, + "learning_rate": 0.00020760233835036664, + "loss": 0.5219763565063477, + "mean_token_accuracy": 0.8438457292318344, + "num_tokens": 2425614.0, + "step": 950 + }, + { + "entropy": 0.5783660891652107, + "epoch": 2.5974025974025974, + "grad_norm": 0.3418065011501312, + "learning_rate": 0.00020504371106063417, + "loss": 0.5258687210083007, + "mean_token_accuracy": 0.8422278153896332, + "num_tokens": 2557421.0, + "step": 1000 + }, + { + "entropy": 0.5746926316618919, + "epoch": 2.7272727272727275, + "grad_norm": 0.3708217740058899, + "learning_rate": 0.00020229162953711157, + "loss": 0.5161260223388672, + "mean_token_accuracy": 0.8453844922780991, + "num_tokens": 2680379.0, + "step": 1050 + }, + { + "entropy": 0.5676264691352845, + "epoch": 2.857142857142857, + "grad_norm": 0.26185691356658936, + "learning_rate": 0.00019935174861812654, + "loss": 0.5179851913452148, + "mean_token_accuracy": 0.8455151951313019, + "num_tokens": 2810854.0, + "step": 1100 + }, + { + "entropy": 0.5804974803328514, + "epoch": 2.987012987012987, + "grad_norm": 0.3131248652935028, + "learning_rate": 0.00019623010902273397, + "loss": 0.5243957138061524, + "mean_token_accuracy": 0.8455932134389877, + "num_tokens": 2932637.0, + "step": 1150 + }, + { + "epoch": 3.0, + "eval_entropy": 0.6151215266436338, + "eval_loss": 0.6557295918464661, + "eval_mean_token_accuracy": 0.8159411993737404, + "eval_num_tokens": 2944650.0, + "eval_runtime": 46.5152, + "eval_samples_per_second": 35.623, + "eval_steps_per_second": 4.472, + "step": 1155 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.637823526955428e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e4d42893a69e354223672a3b0f9de6f2b979c004 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05284766149829996, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj", + "gate_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ae0991113444e15d0f169b7fc0963db878251aba --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/trainer_state.json @@ -0,0 +1,378 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 1540, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.8529596650600433, + "epoch": 0.12987012987012986, + "grad_norm": 0.878183901309967, + "learning_rate": 2.8227544379109735e-05, + "loss": 1.762685546875, + "mean_token_accuracy": 0.6396001759171486, + "num_tokens": 133878.0, + "step": 50 + }, + { + "entropy": 0.9309259909391403, + "epoch": 0.2597402597402597, + "grad_norm": 0.6279756426811218, + "learning_rate": 5.7031161092487016e-05, + "loss": 0.8922532653808594, + "mean_token_accuracy": 0.768746777176857, + "num_tokens": 265533.0, + "step": 100 + }, + { + "entropy": 0.8268384379148483, + "epoch": 0.38961038961038963, + "grad_norm": 0.5478948354721069, + "learning_rate": 8.583477780586432e-05, + "loss": 0.7756452178955078, + "mean_token_accuracy": 0.7925205600261688, + "num_tokens": 386480.0, + "step": 150 + }, + { + "entropy": 0.7658095824718475, + "epoch": 0.5194805194805194, + "grad_norm": 0.5972853899002075, + "learning_rate": 0.00011463839451924158, + "loss": 0.7161135101318359, + "mean_token_accuracy": 0.8019153982400894, + "num_tokens": 520135.0, + "step": 200 + }, + { + "entropy": 0.769344270825386, + "epoch": 0.6493506493506493, + "grad_norm": 0.5576531887054443, + "learning_rate": 0.0001434420112326189, + "loss": 0.716611557006836, + "mean_token_accuracy": 0.8028716742992401, + "num_tokens": 640303.0, + "step": 250 + }, + { + "entropy": 0.7470700722932816, + "epoch": 0.7792207792207793, + "grad_norm": 0.3937978148460388, + "learning_rate": 0.0001722456279459962, + "loss": 0.6983388519287109, + "mean_token_accuracy": 0.80596988260746, + "num_tokens": 766027.0, + "step": 300 + }, + { + "entropy": 0.7172233510017395, + "epoch": 0.9090909090909091, + "grad_norm": 0.32564300298690796, + "learning_rate": 0.00020104924465937345, + "loss": 0.6731016540527344, + "mean_token_accuracy": 0.8119878542423248, + "num_tokens": 894871.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.7397788639825124, + "eval_loss": 0.7304292917251587, + "eval_mean_token_accuracy": 0.7976073359067624, + "eval_num_tokens": 981550.0, + "eval_runtime": 46.5688, + "eval_samples_per_second": 35.582, + "eval_steps_per_second": 4.467, + "step": 385 + }, + { + "entropy": 0.7035581320524216, + "epoch": 1.0389610389610389, + "grad_norm": 0.45747122168540955, + "learning_rate": 0.00022177891520076015, + "loss": 0.6450813293457032, + "mean_token_accuracy": 0.8178210550546646, + "num_tokens": 1021041.0, + "step": 400 + }, + { + "entropy": 0.6679977881908417, + "epoch": 1.1688311688311688, + "grad_norm": 0.4456019401550293, + "learning_rate": 0.0002216012068086725, + "loss": 0.6189227294921875, + "mean_token_accuracy": 0.8257622331380844, + "num_tokens": 1149161.0, + "step": 450 + }, + { + "entropy": 0.6574469250440598, + "epoch": 1.2987012987012987, + "grad_norm": 0.3784359097480774, + "learning_rate": 0.00022119602267552194, + "loss": 0.6098381805419922, + "mean_token_accuracy": 0.8251238936185836, + "num_tokens": 1278079.0, + "step": 500 + }, + { + "entropy": 0.6593573099374771, + "epoch": 1.4285714285714286, + "grad_norm": 0.36508747935295105, + "learning_rate": 0.00022056419535323196, + "loss": 0.6137563705444335, + "mean_token_accuracy": 0.824503253698349, + "num_tokens": 1405326.0, + "step": 550 + }, + { + "entropy": 0.6459340593218803, + "epoch": 1.5584415584415585, + "grad_norm": 0.39773380756378174, + "learning_rate": 0.00021970702308872148, + "loss": 0.5986767196655274, + "mean_token_accuracy": 0.8270765203237533, + "num_tokens": 1532507.0, + "step": 600 + }, + { + "entropy": 0.633097175359726, + "epoch": 1.6883116883116882, + "grad_norm": 0.41731834411621094, + "learning_rate": 0.00021862626715633265, + "loss": 0.5849922180175782, + "mean_token_accuracy": 0.8313153618574143, + "num_tokens": 1662747.0, + "step": 650 + }, + { + "entropy": 0.6569280475378036, + "epoch": 1.8181818181818183, + "grad_norm": 0.3067665100097656, + "learning_rate": 0.00021732414823885307, + "loss": 0.605062370300293, + "mean_token_accuracy": 0.8269566106796264, + "num_tokens": 1783817.0, + "step": 700 + }, + { + "entropy": 0.6459099870920181, + "epoch": 1.948051948051948, + "grad_norm": 0.5508913397789001, + "learning_rate": 0.00021580334186456886, + "loss": 0.596672134399414, + "mean_token_accuracy": 0.8287447422742844, + "num_tokens": 1914573.0, + "step": 750 + }, + { + "epoch": 2.0, + "eval_entropy": 0.6985598478752834, + "eval_loss": 0.6716415286064148, + "eval_mean_token_accuracy": 0.808078097895934, + "eval_num_tokens": 1963100.0, + "eval_runtime": 46.5197, + "eval_samples_per_second": 35.619, + "eval_steps_per_second": 4.471, + "step": 770 + }, + { + "entropy": 0.6113292586803436, + "epoch": 2.0779220779220777, + "grad_norm": 0.37614893913269043, + "learning_rate": 0.00021406697290972404, + "loss": 0.5562425231933594, + "mean_token_accuracy": 0.8369060623645782, + "num_tokens": 2035968.0, + "step": 800 + }, + { + "entropy": 0.5720089420676231, + "epoch": 2.207792207792208, + "grad_norm": 0.35560500621795654, + "learning_rate": 0.00021211860917768236, + "loss": 0.521314697265625, + "mean_token_accuracy": 0.8453685063123703, + "num_tokens": 2168591.0, + "step": 850 + }, + { + "entropy": 0.5757493850588798, + "epoch": 2.3376623376623376, + "grad_norm": 0.304592490196228, + "learning_rate": 0.00020996225406798486, + "loss": 0.5258681106567383, + "mean_token_accuracy": 0.8431038129329681, + "num_tokens": 2297967.0, + "step": 900 + }, + { + "entropy": 0.5790414094924927, + "epoch": 2.4675324675324677, + "grad_norm": 0.29762616753578186, + "learning_rate": 0.00020760233835036664, + "loss": 0.5219763565063477, + "mean_token_accuracy": 0.8438457292318344, + "num_tokens": 2425614.0, + "step": 950 + }, + { + "entropy": 0.5783660891652107, + "epoch": 2.5974025974025974, + "grad_norm": 0.3418065011501312, + "learning_rate": 0.00020504371106063417, + "loss": 0.5258687210083007, + "mean_token_accuracy": 0.8422278153896332, + "num_tokens": 2557421.0, + "step": 1000 + }, + { + "entropy": 0.5746926316618919, + "epoch": 2.7272727272727275, + "grad_norm": 0.3708217740058899, + "learning_rate": 0.00020229162953711157, + "loss": 0.5161260223388672, + "mean_token_accuracy": 0.8453844922780991, + "num_tokens": 2680379.0, + "step": 1050 + }, + { + "entropy": 0.5676264691352845, + "epoch": 2.857142857142857, + "grad_norm": 0.26185691356658936, + "learning_rate": 0.00019935174861812654, + "loss": 0.5179851913452148, + "mean_token_accuracy": 0.8455151951313019, + "num_tokens": 2810854.0, + "step": 1100 + }, + { + "entropy": 0.5804974803328514, + "epoch": 2.987012987012987, + "grad_norm": 0.3131248652935028, + "learning_rate": 0.00019623010902273397, + "loss": 0.5243957138061524, + "mean_token_accuracy": 0.8455932134389877, + "num_tokens": 2932637.0, + "step": 1150 + }, + { + "epoch": 3.0, + "eval_entropy": 0.6151215266436338, + "eval_loss": 0.6557295918464661, + "eval_mean_token_accuracy": 0.8159411993737404, + "eval_num_tokens": 2944650.0, + "eval_runtime": 46.5152, + "eval_samples_per_second": 35.623, + "eval_steps_per_second": 4.472, + "step": 1155 + }, + { + "entropy": 0.5100037640333176, + "epoch": 3.116883116883117, + "grad_norm": 0.32568028569221497, + "learning_rate": 0.00019293312493855094, + "loss": 0.4522856140136719, + "mean_token_accuracy": 0.8600019490718842, + "num_tokens": 3054046.0, + "step": 1200 + }, + { + "entropy": 0.5058341425657272, + "epoch": 3.2467532467532467, + "grad_norm": 0.3810749650001526, + "learning_rate": 0.00018946757084220762, + "loss": 0.44891536712646485, + "mean_token_accuracy": 0.8606845206022262, + "num_tokens": 3182515.0, + "step": 1250 + }, + { + "entropy": 0.5006153827905655, + "epoch": 3.3766233766233764, + "grad_norm": 0.35473746061325073, + "learning_rate": 0.0001858405675794941, + "loss": 0.451065673828125, + "mean_token_accuracy": 0.8608392387628555, + "num_tokens": 3313672.0, + "step": 1300 + }, + { + "entropy": 0.508549126982689, + "epoch": 3.5064935064935066, + "grad_norm": 0.3414103388786316, + "learning_rate": 0.00018205956773380578, + "loss": 0.4535030746459961, + "mean_token_accuracy": 0.859148946404457, + "num_tokens": 3440689.0, + "step": 1350 + }, + { + "entropy": 0.516265479028225, + "epoch": 3.6363636363636362, + "grad_norm": 0.4597613513469696, + "learning_rate": 0.00017813234031295068, + "loss": 0.45882129669189453, + "mean_token_accuracy": 0.8582911169528962, + "num_tokens": 3566427.0, + "step": 1400 + }, + { + "entropy": 0.4902383416891098, + "epoch": 3.7662337662337664, + "grad_norm": 0.3385583460330963, + "learning_rate": 0.0001740669547857841, + "loss": 0.4417523193359375, + "mean_token_accuracy": 0.8632881045341492, + "num_tokens": 3699976.0, + "step": 1450 + }, + { + "entropy": 0.5104234129190445, + "epoch": 3.896103896103896, + "grad_norm": 0.3935626149177551, + "learning_rate": 0.00016987176450147088, + "loss": 0.4547672653198242, + "mean_token_accuracy": 0.8592299193143844, + "num_tokens": 3827926.0, + "step": 1500 + }, + { + "epoch": 4.0, + "eval_entropy": 0.565094108478381, + "eval_loss": 0.6673083305358887, + "eval_mean_token_accuracy": 0.8179781915476689, + "eval_num_tokens": 3926200.0, + "eval_runtime": 46.4987, + "eval_samples_per_second": 35.635, + "eval_steps_per_second": 4.473, + "step": 1540 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.1769399455551386e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e4d42893a69e354223672a3b0f9de6f2b979c004 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05284766149829996, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj", + "gate_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..23a49374c4be1f93d27cf6023f58c78f7c553b38 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/trainer_state.json @@ -0,0 +1,469 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 1925, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.8529596650600433, + "epoch": 0.12987012987012986, + "grad_norm": 0.878183901309967, + "learning_rate": 2.8227544379109735e-05, + "loss": 1.762685546875, + "mean_token_accuracy": 0.6396001759171486, + "num_tokens": 133878.0, + "step": 50 + }, + { + "entropy": 0.9309259909391403, + "epoch": 0.2597402597402597, + "grad_norm": 0.6279756426811218, + "learning_rate": 5.7031161092487016e-05, + "loss": 0.8922532653808594, + "mean_token_accuracy": 0.768746777176857, + "num_tokens": 265533.0, + "step": 100 + }, + { + "entropy": 0.8268384379148483, + "epoch": 0.38961038961038963, + "grad_norm": 0.5478948354721069, + "learning_rate": 8.583477780586432e-05, + "loss": 0.7756452178955078, + "mean_token_accuracy": 0.7925205600261688, + "num_tokens": 386480.0, + "step": 150 + }, + { + "entropy": 0.7658095824718475, + "epoch": 0.5194805194805194, + "grad_norm": 0.5972853899002075, + "learning_rate": 0.00011463839451924158, + "loss": 0.7161135101318359, + "mean_token_accuracy": 0.8019153982400894, + "num_tokens": 520135.0, + "step": 200 + }, + { + "entropy": 0.769344270825386, + "epoch": 0.6493506493506493, + "grad_norm": 0.5576531887054443, + "learning_rate": 0.0001434420112326189, + "loss": 0.716611557006836, + "mean_token_accuracy": 0.8028716742992401, + "num_tokens": 640303.0, + "step": 250 + }, + { + "entropy": 0.7470700722932816, + "epoch": 0.7792207792207793, + "grad_norm": 0.3937978148460388, + "learning_rate": 0.0001722456279459962, + "loss": 0.6983388519287109, + "mean_token_accuracy": 0.80596988260746, + "num_tokens": 766027.0, + "step": 300 + }, + { + "entropy": 0.7172233510017395, + "epoch": 0.9090909090909091, + "grad_norm": 0.32564300298690796, + "learning_rate": 0.00020104924465937345, + "loss": 0.6731016540527344, + "mean_token_accuracy": 0.8119878542423248, + "num_tokens": 894871.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.7397788639825124, + "eval_loss": 0.7304292917251587, + "eval_mean_token_accuracy": 0.7976073359067624, + "eval_num_tokens": 981550.0, + "eval_runtime": 46.5688, + "eval_samples_per_second": 35.582, + "eval_steps_per_second": 4.467, + "step": 385 + }, + { + "entropy": 0.7035581320524216, + "epoch": 1.0389610389610389, + "grad_norm": 0.45747122168540955, + "learning_rate": 0.00022177891520076015, + "loss": 0.6450813293457032, + "mean_token_accuracy": 0.8178210550546646, + "num_tokens": 1021041.0, + "step": 400 + }, + { + "entropy": 0.6679977881908417, + "epoch": 1.1688311688311688, + "grad_norm": 0.4456019401550293, + "learning_rate": 0.0002216012068086725, + "loss": 0.6189227294921875, + "mean_token_accuracy": 0.8257622331380844, + "num_tokens": 1149161.0, + "step": 450 + }, + { + "entropy": 0.6574469250440598, + "epoch": 1.2987012987012987, + "grad_norm": 0.3784359097480774, + "learning_rate": 0.00022119602267552194, + "loss": 0.6098381805419922, + "mean_token_accuracy": 0.8251238936185836, + "num_tokens": 1278079.0, + "step": 500 + }, + { + "entropy": 0.6593573099374771, + "epoch": 1.4285714285714286, + "grad_norm": 0.36508747935295105, + "learning_rate": 0.00022056419535323196, + "loss": 0.6137563705444335, + "mean_token_accuracy": 0.824503253698349, + "num_tokens": 1405326.0, + "step": 550 + }, + { + "entropy": 0.6459340593218803, + "epoch": 1.5584415584415585, + "grad_norm": 0.39773380756378174, + "learning_rate": 0.00021970702308872148, + "loss": 0.5986767196655274, + "mean_token_accuracy": 0.8270765203237533, + "num_tokens": 1532507.0, + "step": 600 + }, + { + "entropy": 0.633097175359726, + "epoch": 1.6883116883116882, + "grad_norm": 0.41731834411621094, + "learning_rate": 0.00021862626715633265, + "loss": 0.5849922180175782, + "mean_token_accuracy": 0.8313153618574143, + "num_tokens": 1662747.0, + "step": 650 + }, + { + "entropy": 0.6569280475378036, + "epoch": 1.8181818181818183, + "grad_norm": 0.3067665100097656, + "learning_rate": 0.00021732414823885307, + "loss": 0.605062370300293, + "mean_token_accuracy": 0.8269566106796264, + "num_tokens": 1783817.0, + "step": 700 + }, + { + "entropy": 0.6459099870920181, + "epoch": 1.948051948051948, + "grad_norm": 0.5508913397789001, + "learning_rate": 0.00021580334186456886, + "loss": 0.596672134399414, + "mean_token_accuracy": 0.8287447422742844, + "num_tokens": 1914573.0, + "step": 750 + }, + { + "epoch": 2.0, + "eval_entropy": 0.6985598478752834, + "eval_loss": 0.6716415286064148, + "eval_mean_token_accuracy": 0.808078097895934, + "eval_num_tokens": 1963100.0, + "eval_runtime": 46.5197, + "eval_samples_per_second": 35.619, + "eval_steps_per_second": 4.471, + "step": 770 + }, + { + "entropy": 0.6113292586803436, + "epoch": 2.0779220779220777, + "grad_norm": 0.37614893913269043, + "learning_rate": 0.00021406697290972404, + "loss": 0.5562425231933594, + "mean_token_accuracy": 0.8369060623645782, + "num_tokens": 2035968.0, + "step": 800 + }, + { + "entropy": 0.5720089420676231, + "epoch": 2.207792207792208, + "grad_norm": 0.35560500621795654, + "learning_rate": 0.00021211860917768236, + "loss": 0.521314697265625, + "mean_token_accuracy": 0.8453685063123703, + "num_tokens": 2168591.0, + "step": 850 + }, + { + "entropy": 0.5757493850588798, + "epoch": 2.3376623376623376, + "grad_norm": 0.304592490196228, + "learning_rate": 0.00020996225406798486, + "loss": 0.5258681106567383, + "mean_token_accuracy": 0.8431038129329681, + "num_tokens": 2297967.0, + "step": 900 + }, + { + "entropy": 0.5790414094924927, + "epoch": 2.4675324675324677, + "grad_norm": 0.29762616753578186, + "learning_rate": 0.00020760233835036664, + "loss": 0.5219763565063477, + "mean_token_accuracy": 0.8438457292318344, + "num_tokens": 2425614.0, + "step": 950 + }, + { + "entropy": 0.5783660891652107, + "epoch": 2.5974025974025974, + "grad_norm": 0.3418065011501312, + "learning_rate": 0.00020504371106063417, + "loss": 0.5258687210083007, + "mean_token_accuracy": 0.8422278153896332, + "num_tokens": 2557421.0, + "step": 1000 + }, + { + "entropy": 0.5746926316618919, + "epoch": 2.7272727272727275, + "grad_norm": 0.3708217740058899, + "learning_rate": 0.00020229162953711157, + "loss": 0.5161260223388672, + "mean_token_accuracy": 0.8453844922780991, + "num_tokens": 2680379.0, + "step": 1050 + }, + { + "entropy": 0.5676264691352845, + "epoch": 2.857142857142857, + "grad_norm": 0.26185691356658936, + "learning_rate": 0.00019935174861812654, + "loss": 0.5179851913452148, + "mean_token_accuracy": 0.8455151951313019, + "num_tokens": 2810854.0, + "step": 1100 + }, + { + "entropy": 0.5804974803328514, + "epoch": 2.987012987012987, + "grad_norm": 0.3131248652935028, + "learning_rate": 0.00019623010902273397, + "loss": 0.5243957138061524, + "mean_token_accuracy": 0.8455932134389877, + "num_tokens": 2932637.0, + "step": 1150 + }, + { + "epoch": 3.0, + "eval_entropy": 0.6151215266436338, + "eval_loss": 0.6557295918464661, + "eval_mean_token_accuracy": 0.8159411993737404, + "eval_num_tokens": 2944650.0, + "eval_runtime": 46.5152, + "eval_samples_per_second": 35.623, + "eval_steps_per_second": 4.472, + "step": 1155 + }, + { + "entropy": 0.5100037640333176, + "epoch": 3.116883116883117, + "grad_norm": 0.32568028569221497, + "learning_rate": 0.00019293312493855094, + "loss": 0.4522856140136719, + "mean_token_accuracy": 0.8600019490718842, + "num_tokens": 3054046.0, + "step": 1200 + }, + { + "entropy": 0.5058341425657272, + "epoch": 3.2467532467532467, + "grad_norm": 0.3810749650001526, + "learning_rate": 0.00018946757084220762, + "loss": 0.44891536712646485, + "mean_token_accuracy": 0.8606845206022262, + "num_tokens": 3182515.0, + "step": 1250 + }, + { + "entropy": 0.5006153827905655, + "epoch": 3.3766233766233764, + "grad_norm": 0.35473746061325073, + "learning_rate": 0.0001858405675794941, + "loss": 0.451065673828125, + "mean_token_accuracy": 0.8608392387628555, + "num_tokens": 3313672.0, + "step": 1300 + }, + { + "entropy": 0.508549126982689, + "epoch": 3.5064935064935066, + "grad_norm": 0.3414103388786316, + "learning_rate": 0.00018205956773380578, + "loss": 0.4535030746459961, + "mean_token_accuracy": 0.859148946404457, + "num_tokens": 3440689.0, + "step": 1350 + }, + { + "entropy": 0.516265479028225, + "epoch": 3.6363636363636362, + "grad_norm": 0.4597613513469696, + "learning_rate": 0.00017813234031295068, + "loss": 0.45882129669189453, + "mean_token_accuracy": 0.8582911169528962, + "num_tokens": 3566427.0, + "step": 1400 + }, + { + "entropy": 0.4902383416891098, + "epoch": 3.7662337662337664, + "grad_norm": 0.3385583460330963, + "learning_rate": 0.0001740669547857841, + "loss": 0.4417523193359375, + "mean_token_accuracy": 0.8632881045341492, + "num_tokens": 3699976.0, + "step": 1450 + }, + { + "entropy": 0.5104234129190445, + "epoch": 3.896103896103896, + "grad_norm": 0.3935626149177551, + "learning_rate": 0.00016987176450147088, + "loss": 0.4547672653198242, + "mean_token_accuracy": 0.8592299193143844, + "num_tokens": 3827926.0, + "step": 1500 + }, + { + "epoch": 4.0, + "eval_entropy": 0.565094108478381, + "eval_loss": 0.6673083305358887, + "eval_mean_token_accuracy": 0.8179781915476689, + "eval_num_tokens": 3926200.0, + "eval_runtime": 46.4987, + "eval_samples_per_second": 35.635, + "eval_steps_per_second": 4.473, + "step": 1540 + }, + { + "entropy": 0.49345644742250444, + "epoch": 4.025974025974026, + "grad_norm": 0.42027363181114197, + "learning_rate": 0.00016555538952544487, + "loss": 0.4355708312988281, + "mean_token_accuracy": 0.8648384511470795, + "num_tokens": 3953109.0, + "step": 1550 + }, + { + "entropy": 0.4247111546993256, + "epoch": 4.1558441558441555, + "grad_norm": 0.4478093087673187, + "learning_rate": 0.00016112669892733307, + "loss": 0.36362716674804685, + "mean_token_accuracy": 0.8844931781291961, + "num_tokens": 4081079.0, + "step": 1600 + }, + { + "entropy": 0.42337420970201495, + "epoch": 4.285714285714286, + "grad_norm": 0.35812026262283325, + "learning_rate": 0.00015659479255723875, + "loss": 0.3651982498168945, + "mean_token_accuracy": 0.8830066406726838, + "num_tokens": 4206426.0, + "step": 1650 + }, + { + "entropy": 0.43247521698474883, + "epoch": 4.415584415584416, + "grad_norm": 0.3665122985839844, + "learning_rate": 0.0001519689823478283, + "loss": 0.37368186950683596, + "mean_token_accuracy": 0.8800795775651932, + "num_tokens": 4329839.0, + "step": 1700 + }, + { + "entropy": 0.4328634282946587, + "epoch": 4.545454545454545, + "grad_norm": 0.33961260318756104, + "learning_rate": 0.00014725877318064152, + "loss": 0.37599964141845704, + "mean_token_accuracy": 0.8797144430875778, + "num_tokens": 4453940.0, + "step": 1750 + }, + { + "entropy": 0.42742862343788146, + "epoch": 4.675324675324675, + "grad_norm": 0.4312469959259033, + "learning_rate": 0.0001424738433559405, + "loss": 0.37354656219482424, + "mean_token_accuracy": 0.880826217532158, + "num_tokens": 4583626.0, + "step": 1800 + }, + { + "entropy": 0.4280877533555031, + "epoch": 4.805194805194805, + "grad_norm": 0.4343482255935669, + "learning_rate": 0.0001376240247062263, + "loss": 0.3688441467285156, + "mean_token_accuracy": 0.8814105206727981, + "num_tokens": 4713540.0, + "step": 1850 + }, + { + "entropy": 0.42952129155397417, + "epoch": 4.935064935064935, + "grad_norm": 0.47131651639938354, + "learning_rate": 0.00013271928239428512, + "loss": 0.37270416259765626, + "mean_token_accuracy": 0.8788579875230789, + "num_tokens": 4845678.0, + "step": 1900 + }, + { + "epoch": 5.0, + "eval_entropy": 0.5064233084424183, + "eval_loss": 0.689122200012207, + "eval_mean_token_accuracy": 0.818150080453891, + "eval_num_tokens": 4907750.0, + "eval_runtime": 46.5114, + "eval_samples_per_second": 35.626, + "eval_steps_per_second": 4.472, + "step": 1925 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.723982041528156e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e4d42893a69e354223672a3b0f9de6f2b979c004 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05284766149829996, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj", + "gate_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8ac719a0154a177a028f67b0e5c3ef960a7aca74 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/trainer_state.json @@ -0,0 +1,560 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.0, + "eval_steps": 500, + "global_step": 2310, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.8529596650600433, + "epoch": 0.12987012987012986, + "grad_norm": 0.878183901309967, + "learning_rate": 2.8227544379109735e-05, + "loss": 1.762685546875, + "mean_token_accuracy": 0.6396001759171486, + "num_tokens": 133878.0, + "step": 50 + }, + { + "entropy": 0.9309259909391403, + "epoch": 0.2597402597402597, + "grad_norm": 0.6279756426811218, + "learning_rate": 5.7031161092487016e-05, + "loss": 0.8922532653808594, + "mean_token_accuracy": 0.768746777176857, + "num_tokens": 265533.0, + "step": 100 + }, + { + "entropy": 0.8268384379148483, + "epoch": 0.38961038961038963, + "grad_norm": 0.5478948354721069, + "learning_rate": 8.583477780586432e-05, + "loss": 0.7756452178955078, + "mean_token_accuracy": 0.7925205600261688, + "num_tokens": 386480.0, + "step": 150 + }, + { + "entropy": 0.7658095824718475, + "epoch": 0.5194805194805194, + "grad_norm": 0.5972853899002075, + "learning_rate": 0.00011463839451924158, + "loss": 0.7161135101318359, + "mean_token_accuracy": 0.8019153982400894, + "num_tokens": 520135.0, + "step": 200 + }, + { + "entropy": 0.769344270825386, + "epoch": 0.6493506493506493, + "grad_norm": 0.5576531887054443, + "learning_rate": 0.0001434420112326189, + "loss": 0.716611557006836, + "mean_token_accuracy": 0.8028716742992401, + "num_tokens": 640303.0, + "step": 250 + }, + { + "entropy": 0.7470700722932816, + "epoch": 0.7792207792207793, + "grad_norm": 0.3937978148460388, + "learning_rate": 0.0001722456279459962, + "loss": 0.6983388519287109, + "mean_token_accuracy": 0.80596988260746, + "num_tokens": 766027.0, + "step": 300 + }, + { + "entropy": 0.7172233510017395, + "epoch": 0.9090909090909091, + "grad_norm": 0.32564300298690796, + "learning_rate": 0.00020104924465937345, + "loss": 0.6731016540527344, + "mean_token_accuracy": 0.8119878542423248, + "num_tokens": 894871.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.7397788639825124, + "eval_loss": 0.7304292917251587, + "eval_mean_token_accuracy": 0.7976073359067624, + "eval_num_tokens": 981550.0, + "eval_runtime": 46.5688, + "eval_samples_per_second": 35.582, + "eval_steps_per_second": 4.467, + "step": 385 + }, + { + "entropy": 0.7035581320524216, + "epoch": 1.0389610389610389, + "grad_norm": 0.45747122168540955, + "learning_rate": 0.00022177891520076015, + "loss": 0.6450813293457032, + "mean_token_accuracy": 0.8178210550546646, + "num_tokens": 1021041.0, + "step": 400 + }, + { + "entropy": 0.6679977881908417, + "epoch": 1.1688311688311688, + "grad_norm": 0.4456019401550293, + "learning_rate": 0.0002216012068086725, + "loss": 0.6189227294921875, + "mean_token_accuracy": 0.8257622331380844, + "num_tokens": 1149161.0, + "step": 450 + }, + { + "entropy": 0.6574469250440598, + "epoch": 1.2987012987012987, + "grad_norm": 0.3784359097480774, + "learning_rate": 0.00022119602267552194, + "loss": 0.6098381805419922, + "mean_token_accuracy": 0.8251238936185836, + "num_tokens": 1278079.0, + "step": 500 + }, + { + "entropy": 0.6593573099374771, + "epoch": 1.4285714285714286, + "grad_norm": 0.36508747935295105, + "learning_rate": 0.00022056419535323196, + "loss": 0.6137563705444335, + "mean_token_accuracy": 0.824503253698349, + "num_tokens": 1405326.0, + "step": 550 + }, + { + "entropy": 0.6459340593218803, + "epoch": 1.5584415584415585, + "grad_norm": 0.39773380756378174, + "learning_rate": 0.00021970702308872148, + "loss": 0.5986767196655274, + "mean_token_accuracy": 0.8270765203237533, + "num_tokens": 1532507.0, + "step": 600 + }, + { + "entropy": 0.633097175359726, + "epoch": 1.6883116883116882, + "grad_norm": 0.41731834411621094, + "learning_rate": 0.00021862626715633265, + "loss": 0.5849922180175782, + "mean_token_accuracy": 0.8313153618574143, + "num_tokens": 1662747.0, + "step": 650 + }, + { + "entropy": 0.6569280475378036, + "epoch": 1.8181818181818183, + "grad_norm": 0.3067665100097656, + "learning_rate": 0.00021732414823885307, + "loss": 0.605062370300293, + "mean_token_accuracy": 0.8269566106796264, + "num_tokens": 1783817.0, + "step": 700 + }, + { + "entropy": 0.6459099870920181, + "epoch": 1.948051948051948, + "grad_norm": 0.5508913397789001, + "learning_rate": 0.00021580334186456886, + "loss": 0.596672134399414, + "mean_token_accuracy": 0.8287447422742844, + "num_tokens": 1914573.0, + "step": 750 + }, + { + "epoch": 2.0, + "eval_entropy": 0.6985598478752834, + "eval_loss": 0.6716415286064148, + "eval_mean_token_accuracy": 0.808078097895934, + "eval_num_tokens": 1963100.0, + "eval_runtime": 46.5197, + "eval_samples_per_second": 35.619, + "eval_steps_per_second": 4.471, + "step": 770 + }, + { + "entropy": 0.6113292586803436, + "epoch": 2.0779220779220777, + "grad_norm": 0.37614893913269043, + "learning_rate": 0.00021406697290972404, + "loss": 0.5562425231933594, + "mean_token_accuracy": 0.8369060623645782, + "num_tokens": 2035968.0, + "step": 800 + }, + { + "entropy": 0.5720089420676231, + "epoch": 2.207792207792208, + "grad_norm": 0.35560500621795654, + "learning_rate": 0.00021211860917768236, + "loss": 0.521314697265625, + "mean_token_accuracy": 0.8453685063123703, + "num_tokens": 2168591.0, + "step": 850 + }, + { + "entropy": 0.5757493850588798, + "epoch": 2.3376623376623376, + "grad_norm": 0.304592490196228, + "learning_rate": 0.00020996225406798486, + "loss": 0.5258681106567383, + "mean_token_accuracy": 0.8431038129329681, + "num_tokens": 2297967.0, + "step": 900 + }, + { + "entropy": 0.5790414094924927, + "epoch": 2.4675324675324677, + "grad_norm": 0.29762616753578186, + "learning_rate": 0.00020760233835036664, + "loss": 0.5219763565063477, + "mean_token_accuracy": 0.8438457292318344, + "num_tokens": 2425614.0, + "step": 950 + }, + { + "entropy": 0.5783660891652107, + "epoch": 2.5974025974025974, + "grad_norm": 0.3418065011501312, + "learning_rate": 0.00020504371106063417, + "loss": 0.5258687210083007, + "mean_token_accuracy": 0.8422278153896332, + "num_tokens": 2557421.0, + "step": 1000 + }, + { + "entropy": 0.5746926316618919, + "epoch": 2.7272727272727275, + "grad_norm": 0.3708217740058899, + "learning_rate": 0.00020229162953711157, + "loss": 0.5161260223388672, + "mean_token_accuracy": 0.8453844922780991, + "num_tokens": 2680379.0, + "step": 1050 + }, + { + "entropy": 0.5676264691352845, + "epoch": 2.857142857142857, + "grad_norm": 0.26185691356658936, + "learning_rate": 0.00019935174861812654, + "loss": 0.5179851913452148, + "mean_token_accuracy": 0.8455151951313019, + "num_tokens": 2810854.0, + "step": 1100 + }, + { + "entropy": 0.5804974803328514, + "epoch": 2.987012987012987, + "grad_norm": 0.3131248652935028, + "learning_rate": 0.00019623010902273397, + "loss": 0.5243957138061524, + "mean_token_accuracy": 0.8455932134389877, + "num_tokens": 2932637.0, + "step": 1150 + }, + { + "epoch": 3.0, + "eval_entropy": 0.6151215266436338, + "eval_loss": 0.6557295918464661, + "eval_mean_token_accuracy": 0.8159411993737404, + "eval_num_tokens": 2944650.0, + "eval_runtime": 46.5152, + "eval_samples_per_second": 35.623, + "eval_steps_per_second": 4.472, + "step": 1155 + }, + { + "entropy": 0.5100037640333176, + "epoch": 3.116883116883117, + "grad_norm": 0.32568028569221497, + "learning_rate": 0.00019293312493855094, + "loss": 0.4522856140136719, + "mean_token_accuracy": 0.8600019490718842, + "num_tokens": 3054046.0, + "step": 1200 + }, + { + "entropy": 0.5058341425657272, + "epoch": 3.2467532467532467, + "grad_norm": 0.3810749650001526, + "learning_rate": 0.00018946757084220762, + "loss": 0.44891536712646485, + "mean_token_accuracy": 0.8606845206022262, + "num_tokens": 3182515.0, + "step": 1250 + }, + { + "entropy": 0.5006153827905655, + "epoch": 3.3766233766233764, + "grad_norm": 0.35473746061325073, + "learning_rate": 0.0001858405675794941, + "loss": 0.451065673828125, + "mean_token_accuracy": 0.8608392387628555, + "num_tokens": 3313672.0, + "step": 1300 + }, + { + "entropy": 0.508549126982689, + "epoch": 3.5064935064935066, + "grad_norm": 0.3414103388786316, + "learning_rate": 0.00018205956773380578, + "loss": 0.4535030746459961, + "mean_token_accuracy": 0.859148946404457, + "num_tokens": 3440689.0, + "step": 1350 + }, + { + "entropy": 0.516265479028225, + "epoch": 3.6363636363636362, + "grad_norm": 0.4597613513469696, + "learning_rate": 0.00017813234031295068, + "loss": 0.45882129669189453, + "mean_token_accuracy": 0.8582911169528962, + "num_tokens": 3566427.0, + "step": 1400 + }, + { + "entropy": 0.4902383416891098, + "epoch": 3.7662337662337664, + "grad_norm": 0.3385583460330963, + "learning_rate": 0.0001740669547857841, + "loss": 0.4417523193359375, + "mean_token_accuracy": 0.8632881045341492, + "num_tokens": 3699976.0, + "step": 1450 + }, + { + "entropy": 0.5104234129190445, + "epoch": 3.896103896103896, + "grad_norm": 0.3935626149177551, + "learning_rate": 0.00016987176450147088, + "loss": 0.4547672653198242, + "mean_token_accuracy": 0.8592299193143844, + "num_tokens": 3827926.0, + "step": 1500 + }, + { + "epoch": 4.0, + "eval_entropy": 0.565094108478381, + "eval_loss": 0.6673083305358887, + "eval_mean_token_accuracy": 0.8179781915476689, + "eval_num_tokens": 3926200.0, + "eval_runtime": 46.4987, + "eval_samples_per_second": 35.635, + "eval_steps_per_second": 4.473, + "step": 1540 + }, + { + "entropy": 0.49345644742250444, + "epoch": 4.025974025974026, + "grad_norm": 0.42027363181114197, + "learning_rate": 0.00016555538952544487, + "loss": 0.4355708312988281, + "mean_token_accuracy": 0.8648384511470795, + "num_tokens": 3953109.0, + "step": 1550 + }, + { + "entropy": 0.4247111546993256, + "epoch": 4.1558441558441555, + "grad_norm": 0.4478093087673187, + "learning_rate": 0.00016112669892733307, + "loss": 0.36362716674804685, + "mean_token_accuracy": 0.8844931781291961, + "num_tokens": 4081079.0, + "step": 1600 + }, + { + "entropy": 0.42337420970201495, + "epoch": 4.285714285714286, + "grad_norm": 0.35812026262283325, + "learning_rate": 0.00015659479255723875, + "loss": 0.3651982498168945, + "mean_token_accuracy": 0.8830066406726838, + "num_tokens": 4206426.0, + "step": 1650 + }, + { + "entropy": 0.43247521698474883, + "epoch": 4.415584415584416, + "grad_norm": 0.3665122985839844, + "learning_rate": 0.0001519689823478283, + "loss": 0.37368186950683596, + "mean_token_accuracy": 0.8800795775651932, + "num_tokens": 4329839.0, + "step": 1700 + }, + { + "entropy": 0.4328634282946587, + "epoch": 4.545454545454545, + "grad_norm": 0.33961260318756104, + "learning_rate": 0.00014725877318064152, + "loss": 0.37599964141845704, + "mean_token_accuracy": 0.8797144430875778, + "num_tokens": 4453940.0, + "step": 1750 + }, + { + "entropy": 0.42742862343788146, + "epoch": 4.675324675324675, + "grad_norm": 0.4312469959259033, + "learning_rate": 0.0001424738433559405, + "loss": 0.37354656219482424, + "mean_token_accuracy": 0.880826217532158, + "num_tokens": 4583626.0, + "step": 1800 + }, + { + "entropy": 0.4280877533555031, + "epoch": 4.805194805194805, + "grad_norm": 0.4343482255935669, + "learning_rate": 0.0001376240247062263, + "loss": 0.3688441467285156, + "mean_token_accuracy": 0.8814105206727981, + "num_tokens": 4713540.0, + "step": 1850 + }, + { + "entropy": 0.42952129155397417, + "epoch": 4.935064935064935, + "grad_norm": 0.47131651639938354, + "learning_rate": 0.00013271928239428512, + "loss": 0.37270416259765626, + "mean_token_accuracy": 0.8788579875230789, + "num_tokens": 4845678.0, + "step": 1900 + }, + { + "epoch": 5.0, + "eval_entropy": 0.5064233084424183, + "eval_loss": 0.689122200012207, + "eval_mean_token_accuracy": 0.818150080453891, + "eval_num_tokens": 4907750.0, + "eval_runtime": 46.5114, + "eval_samples_per_second": 35.626, + "eval_steps_per_second": 4.472, + "step": 1925 + }, + { + "entropy": 0.3798492255806923, + "epoch": 5.064935064935065, + "grad_norm": 0.414468377828598, + "learning_rate": 0.0001277696944372747, + "loss": 0.31735713958740236, + "mean_token_accuracy": 0.8974772602319717, + "num_tokens": 4975955.0, + "step": 1950 + }, + { + "entropy": 0.3367840954661369, + "epoch": 5.194805194805195, + "grad_norm": 0.4515029191970825, + "learning_rate": 0.00012278543099892257, + "loss": 0.272756290435791, + "mean_token_accuracy": 0.9084931749105454, + "num_tokens": 5104770.0, + "step": 2000 + }, + { + "entropy": 0.34766108095645903, + "epoch": 5.324675324675325, + "grad_norm": 0.4040578007698059, + "learning_rate": 0.00011777673349238672, + "loss": 0.2792487144470215, + "mean_token_accuracy": 0.9065623581409454, + "num_tokens": 5229052.0, + "step": 2050 + }, + { + "entropy": 0.3566804251074791, + "epoch": 5.454545454545454, + "grad_norm": 0.5059600472450256, + "learning_rate": 0.00011275389353671628, + "loss": 0.2896596145629883, + "mean_token_accuracy": 0.9045185309648514, + "num_tokens": 5357753.0, + "step": 2100 + }, + { + "entropy": 0.3462292793393135, + "epoch": 5.584415584415584, + "grad_norm": 0.4664144217967987, + "learning_rate": 0.00010772723181015153, + "loss": 0.27794593811035156, + "mean_token_accuracy": 0.9075321304798126, + "num_tokens": 5481550.0, + "step": 2150 + }, + { + "entropy": 0.3434346827864647, + "epoch": 5.714285714285714, + "grad_norm": 0.46017780900001526, + "learning_rate": 0.00010270707684371499, + "loss": 0.2783885383605957, + "mean_token_accuracy": 0.9063384455442428, + "num_tokens": 5609104.0, + "step": 2200 + }, + { + "entropy": 0.341832632124424, + "epoch": 5.8441558441558445, + "grad_norm": 0.429608017206192, + "learning_rate": 9.77037437986665e-05, + "loss": 0.2815263748168945, + "mean_token_accuracy": 0.9060239523649216, + "num_tokens": 5737347.0, + "step": 2250 + }, + { + "entropy": 0.3492265248298645, + "epoch": 5.974025974025974, + "grad_norm": 0.5019266605377197, + "learning_rate": 9.272751327143021e-05, + "loss": 0.2844840621948242, + "mean_token_accuracy": 0.9042869365215301, + "num_tokens": 5861872.0, + "step": 2300 + }, + { + "epoch": 6.0, + "eval_entropy": 0.4365639974578069, + "eval_loss": 0.7624168395996094, + "eval_mean_token_accuracy": 0.8165808867376584, + "eval_num_tokens": 5889300.0, + "eval_runtime": 46.5221, + "eval_samples_per_second": 35.617, + "eval_steps_per_second": 4.471, + "step": 2310 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.26902097163009e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e4d42893a69e354223672a3b0f9de6f2b979c004 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05284766149829996, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj", + "gate_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6333976c357544886163bf2eea9f10e174b8af47 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/trainer_state.json @@ -0,0 +1,641 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.0, + "eval_steps": 500, + "global_step": 2695, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.8529596650600433, + "epoch": 0.12987012987012986, + "grad_norm": 0.878183901309967, + "learning_rate": 2.8227544379109735e-05, + "loss": 1.762685546875, + "mean_token_accuracy": 0.6396001759171486, + "num_tokens": 133878.0, + "step": 50 + }, + { + "entropy": 0.9309259909391403, + "epoch": 0.2597402597402597, + "grad_norm": 0.6279756426811218, + "learning_rate": 5.7031161092487016e-05, + "loss": 0.8922532653808594, + "mean_token_accuracy": 0.768746777176857, + "num_tokens": 265533.0, + "step": 100 + }, + { + "entropy": 0.8268384379148483, + "epoch": 0.38961038961038963, + "grad_norm": 0.5478948354721069, + "learning_rate": 8.583477780586432e-05, + "loss": 0.7756452178955078, + "mean_token_accuracy": 0.7925205600261688, + "num_tokens": 386480.0, + "step": 150 + }, + { + "entropy": 0.7658095824718475, + "epoch": 0.5194805194805194, + "grad_norm": 0.5972853899002075, + "learning_rate": 0.00011463839451924158, + "loss": 0.7161135101318359, + "mean_token_accuracy": 0.8019153982400894, + "num_tokens": 520135.0, + "step": 200 + }, + { + "entropy": 0.769344270825386, + "epoch": 0.6493506493506493, + "grad_norm": 0.5576531887054443, + "learning_rate": 0.0001434420112326189, + "loss": 0.716611557006836, + "mean_token_accuracy": 0.8028716742992401, + "num_tokens": 640303.0, + "step": 250 + }, + { + "entropy": 0.7470700722932816, + "epoch": 0.7792207792207793, + "grad_norm": 0.3937978148460388, + "learning_rate": 0.0001722456279459962, + "loss": 0.6983388519287109, + "mean_token_accuracy": 0.80596988260746, + "num_tokens": 766027.0, + "step": 300 + }, + { + "entropy": 0.7172233510017395, + "epoch": 0.9090909090909091, + "grad_norm": 0.32564300298690796, + "learning_rate": 0.00020104924465937345, + "loss": 0.6731016540527344, + "mean_token_accuracy": 0.8119878542423248, + "num_tokens": 894871.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.7397788639825124, + "eval_loss": 0.7304292917251587, + "eval_mean_token_accuracy": 0.7976073359067624, + "eval_num_tokens": 981550.0, + "eval_runtime": 46.5688, + "eval_samples_per_second": 35.582, + "eval_steps_per_second": 4.467, + "step": 385 + }, + { + "entropy": 0.7035581320524216, + "epoch": 1.0389610389610389, + "grad_norm": 0.45747122168540955, + "learning_rate": 0.00022177891520076015, + "loss": 0.6450813293457032, + "mean_token_accuracy": 0.8178210550546646, + "num_tokens": 1021041.0, + "step": 400 + }, + { + "entropy": 0.6679977881908417, + "epoch": 1.1688311688311688, + "grad_norm": 0.4456019401550293, + "learning_rate": 0.0002216012068086725, + "loss": 0.6189227294921875, + "mean_token_accuracy": 0.8257622331380844, + "num_tokens": 1149161.0, + "step": 450 + }, + { + "entropy": 0.6574469250440598, + "epoch": 1.2987012987012987, + "grad_norm": 0.3784359097480774, + "learning_rate": 0.00022119602267552194, + "loss": 0.6098381805419922, + "mean_token_accuracy": 0.8251238936185836, + "num_tokens": 1278079.0, + "step": 500 + }, + { + "entropy": 0.6593573099374771, + "epoch": 1.4285714285714286, + "grad_norm": 0.36508747935295105, + "learning_rate": 0.00022056419535323196, + "loss": 0.6137563705444335, + "mean_token_accuracy": 0.824503253698349, + "num_tokens": 1405326.0, + "step": 550 + }, + { + "entropy": 0.6459340593218803, + "epoch": 1.5584415584415585, + "grad_norm": 0.39773380756378174, + "learning_rate": 0.00021970702308872148, + "loss": 0.5986767196655274, + "mean_token_accuracy": 0.8270765203237533, + "num_tokens": 1532507.0, + "step": 600 + }, + { + "entropy": 0.633097175359726, + "epoch": 1.6883116883116882, + "grad_norm": 0.41731834411621094, + "learning_rate": 0.00021862626715633265, + "loss": 0.5849922180175782, + "mean_token_accuracy": 0.8313153618574143, + "num_tokens": 1662747.0, + "step": 650 + }, + { + "entropy": 0.6569280475378036, + "epoch": 1.8181818181818183, + "grad_norm": 0.3067665100097656, + "learning_rate": 0.00021732414823885307, + "loss": 0.605062370300293, + "mean_token_accuracy": 0.8269566106796264, + "num_tokens": 1783817.0, + "step": 700 + }, + { + "entropy": 0.6459099870920181, + "epoch": 1.948051948051948, + "grad_norm": 0.5508913397789001, + "learning_rate": 0.00021580334186456886, + "loss": 0.596672134399414, + "mean_token_accuracy": 0.8287447422742844, + "num_tokens": 1914573.0, + "step": 750 + }, + { + "epoch": 2.0, + "eval_entropy": 0.6985598478752834, + "eval_loss": 0.6716415286064148, + "eval_mean_token_accuracy": 0.808078097895934, + "eval_num_tokens": 1963100.0, + "eval_runtime": 46.5197, + "eval_samples_per_second": 35.619, + "eval_steps_per_second": 4.471, + "step": 770 + }, + { + "entropy": 0.6113292586803436, + "epoch": 2.0779220779220777, + "grad_norm": 0.37614893913269043, + "learning_rate": 0.00021406697290972404, + "loss": 0.5562425231933594, + "mean_token_accuracy": 0.8369060623645782, + "num_tokens": 2035968.0, + "step": 800 + }, + { + "entropy": 0.5720089420676231, + "epoch": 2.207792207792208, + "grad_norm": 0.35560500621795654, + "learning_rate": 0.00021211860917768236, + "loss": 0.521314697265625, + "mean_token_accuracy": 0.8453685063123703, + "num_tokens": 2168591.0, + "step": 850 + }, + { + "entropy": 0.5757493850588798, + "epoch": 2.3376623376623376, + "grad_norm": 0.304592490196228, + "learning_rate": 0.00020996225406798486, + "loss": 0.5258681106567383, + "mean_token_accuracy": 0.8431038129329681, + "num_tokens": 2297967.0, + "step": 900 + }, + { + "entropy": 0.5790414094924927, + "epoch": 2.4675324675324677, + "grad_norm": 0.29762616753578186, + "learning_rate": 0.00020760233835036664, + "loss": 0.5219763565063477, + "mean_token_accuracy": 0.8438457292318344, + "num_tokens": 2425614.0, + "step": 950 + }, + { + "entropy": 0.5783660891652107, + "epoch": 2.5974025974025974, + "grad_norm": 0.3418065011501312, + "learning_rate": 0.00020504371106063417, + "loss": 0.5258687210083007, + "mean_token_accuracy": 0.8422278153896332, + "num_tokens": 2557421.0, + "step": 1000 + }, + { + "entropy": 0.5746926316618919, + "epoch": 2.7272727272727275, + "grad_norm": 0.3708217740058899, + "learning_rate": 0.00020229162953711157, + "loss": 0.5161260223388672, + "mean_token_accuracy": 0.8453844922780991, + "num_tokens": 2680379.0, + "step": 1050 + }, + { + "entropy": 0.5676264691352845, + "epoch": 2.857142857142857, + "grad_norm": 0.26185691356658936, + "learning_rate": 0.00019935174861812654, + "loss": 0.5179851913452148, + "mean_token_accuracy": 0.8455151951313019, + "num_tokens": 2810854.0, + "step": 1100 + }, + { + "entropy": 0.5804974803328514, + "epoch": 2.987012987012987, + "grad_norm": 0.3131248652935028, + "learning_rate": 0.00019623010902273397, + "loss": 0.5243957138061524, + "mean_token_accuracy": 0.8455932134389877, + "num_tokens": 2932637.0, + "step": 1150 + }, + { + "epoch": 3.0, + "eval_entropy": 0.6151215266436338, + "eval_loss": 0.6557295918464661, + "eval_mean_token_accuracy": 0.8159411993737404, + "eval_num_tokens": 2944650.0, + "eval_runtime": 46.5152, + "eval_samples_per_second": 35.623, + "eval_steps_per_second": 4.472, + "step": 1155 + }, + { + "entropy": 0.5100037640333176, + "epoch": 3.116883116883117, + "grad_norm": 0.32568028569221497, + "learning_rate": 0.00019293312493855094, + "loss": 0.4522856140136719, + "mean_token_accuracy": 0.8600019490718842, + "num_tokens": 3054046.0, + "step": 1200 + }, + { + "entropy": 0.5058341425657272, + "epoch": 3.2467532467532467, + "grad_norm": 0.3810749650001526, + "learning_rate": 0.00018946757084220762, + "loss": 0.44891536712646485, + "mean_token_accuracy": 0.8606845206022262, + "num_tokens": 3182515.0, + "step": 1250 + }, + { + "entropy": 0.5006153827905655, + "epoch": 3.3766233766233764, + "grad_norm": 0.35473746061325073, + "learning_rate": 0.0001858405675794941, + "loss": 0.451065673828125, + "mean_token_accuracy": 0.8608392387628555, + "num_tokens": 3313672.0, + "step": 1300 + }, + { + "entropy": 0.508549126982689, + "epoch": 3.5064935064935066, + "grad_norm": 0.3414103388786316, + "learning_rate": 0.00018205956773380578, + "loss": 0.4535030746459961, + "mean_token_accuracy": 0.859148946404457, + "num_tokens": 3440689.0, + "step": 1350 + }, + { + "entropy": 0.516265479028225, + "epoch": 3.6363636363636362, + "grad_norm": 0.4597613513469696, + "learning_rate": 0.00017813234031295068, + "loss": 0.45882129669189453, + "mean_token_accuracy": 0.8582911169528962, + "num_tokens": 3566427.0, + "step": 1400 + }, + { + "entropy": 0.4902383416891098, + "epoch": 3.7662337662337664, + "grad_norm": 0.3385583460330963, + "learning_rate": 0.0001740669547857841, + "loss": 0.4417523193359375, + "mean_token_accuracy": 0.8632881045341492, + "num_tokens": 3699976.0, + "step": 1450 + }, + { + "entropy": 0.5104234129190445, + "epoch": 3.896103896103896, + "grad_norm": 0.3935626149177551, + "learning_rate": 0.00016987176450147088, + "loss": 0.4547672653198242, + "mean_token_accuracy": 0.8592299193143844, + "num_tokens": 3827926.0, + "step": 1500 + }, + { + "epoch": 4.0, + "eval_entropy": 0.565094108478381, + "eval_loss": 0.6673083305358887, + "eval_mean_token_accuracy": 0.8179781915476689, + "eval_num_tokens": 3926200.0, + "eval_runtime": 46.4987, + "eval_samples_per_second": 35.635, + "eval_steps_per_second": 4.473, + "step": 1540 + }, + { + "entropy": 0.49345644742250444, + "epoch": 4.025974025974026, + "grad_norm": 0.42027363181114197, + "learning_rate": 0.00016555538952544487, + "loss": 0.4355708312988281, + "mean_token_accuracy": 0.8648384511470795, + "num_tokens": 3953109.0, + "step": 1550 + }, + { + "entropy": 0.4247111546993256, + "epoch": 4.1558441558441555, + "grad_norm": 0.4478093087673187, + "learning_rate": 0.00016112669892733307, + "loss": 0.36362716674804685, + "mean_token_accuracy": 0.8844931781291961, + "num_tokens": 4081079.0, + "step": 1600 + }, + { + "entropy": 0.42337420970201495, + "epoch": 4.285714285714286, + "grad_norm": 0.35812026262283325, + "learning_rate": 0.00015659479255723875, + "loss": 0.3651982498168945, + "mean_token_accuracy": 0.8830066406726838, + "num_tokens": 4206426.0, + "step": 1650 + }, + { + "entropy": 0.43247521698474883, + "epoch": 4.415584415584416, + "grad_norm": 0.3665122985839844, + "learning_rate": 0.0001519689823478283, + "loss": 0.37368186950683596, + "mean_token_accuracy": 0.8800795775651932, + "num_tokens": 4329839.0, + "step": 1700 + }, + { + "entropy": 0.4328634282946587, + "epoch": 4.545454545454545, + "grad_norm": 0.33961260318756104, + "learning_rate": 0.00014725877318064152, + "loss": 0.37599964141845704, + "mean_token_accuracy": 0.8797144430875778, + "num_tokens": 4453940.0, + "step": 1750 + }, + { + "entropy": 0.42742862343788146, + "epoch": 4.675324675324675, + "grad_norm": 0.4312469959259033, + "learning_rate": 0.0001424738433559405, + "loss": 0.37354656219482424, + "mean_token_accuracy": 0.880826217532158, + "num_tokens": 4583626.0, + "step": 1800 + }, + { + "entropy": 0.4280877533555031, + "epoch": 4.805194805194805, + "grad_norm": 0.4343482255935669, + "learning_rate": 0.0001376240247062263, + "loss": 0.3688441467285156, + "mean_token_accuracy": 0.8814105206727981, + "num_tokens": 4713540.0, + "step": 1850 + }, + { + "entropy": 0.42952129155397417, + "epoch": 4.935064935064935, + "grad_norm": 0.47131651639938354, + "learning_rate": 0.00013271928239428512, + "loss": 0.37270416259765626, + "mean_token_accuracy": 0.8788579875230789, + "num_tokens": 4845678.0, + "step": 1900 + }, + { + "epoch": 5.0, + "eval_entropy": 0.5064233084424183, + "eval_loss": 0.689122200012207, + "eval_mean_token_accuracy": 0.818150080453891, + "eval_num_tokens": 4907750.0, + "eval_runtime": 46.5114, + "eval_samples_per_second": 35.626, + "eval_steps_per_second": 4.472, + "step": 1925 + }, + { + "entropy": 0.3798492255806923, + "epoch": 5.064935064935065, + "grad_norm": 0.414468377828598, + "learning_rate": 0.0001277696944372747, + "loss": 0.31735713958740236, + "mean_token_accuracy": 0.8974772602319717, + "num_tokens": 4975955.0, + "step": 1950 + }, + { + "entropy": 0.3367840954661369, + "epoch": 5.194805194805195, + "grad_norm": 0.4515029191970825, + "learning_rate": 0.00012278543099892257, + "loss": 0.272756290435791, + "mean_token_accuracy": 0.9084931749105454, + "num_tokens": 5104770.0, + "step": 2000 + }, + { + "entropy": 0.34766108095645903, + "epoch": 5.324675324675325, + "grad_norm": 0.4040578007698059, + "learning_rate": 0.00011777673349238672, + "loss": 0.2792487144470215, + "mean_token_accuracy": 0.9065623581409454, + "num_tokens": 5229052.0, + "step": 2050 + }, + { + "entropy": 0.3566804251074791, + "epoch": 5.454545454545454, + "grad_norm": 0.5059600472450256, + "learning_rate": 0.00011275389353671628, + "loss": 0.2896596145629883, + "mean_token_accuracy": 0.9045185309648514, + "num_tokens": 5357753.0, + "step": 2100 + }, + { + "entropy": 0.3462292793393135, + "epoch": 5.584415584415584, + "grad_norm": 0.4664144217967987, + "learning_rate": 0.00010772723181015153, + "loss": 0.27794593811035156, + "mean_token_accuracy": 0.9075321304798126, + "num_tokens": 5481550.0, + "step": 2150 + }, + { + "entropy": 0.3434346827864647, + "epoch": 5.714285714285714, + "grad_norm": 0.46017780900001526, + "learning_rate": 0.00010270707684371499, + "loss": 0.2783885383605957, + "mean_token_accuracy": 0.9063384455442428, + "num_tokens": 5609104.0, + "step": 2200 + }, + { + "entropy": 0.341832632124424, + "epoch": 5.8441558441558445, + "grad_norm": 0.429608017206192, + "learning_rate": 9.77037437986665e-05, + "loss": 0.2815263748168945, + "mean_token_accuracy": 0.9060239523649216, + "num_tokens": 5737347.0, + "step": 2250 + }, + { + "entropy": 0.3492265248298645, + "epoch": 5.974025974025974, + "grad_norm": 0.5019266605377197, + "learning_rate": 9.272751327143021e-05, + "loss": 0.2844840621948242, + "mean_token_accuracy": 0.9042869365215301, + "num_tokens": 5861872.0, + "step": 2300 + }, + { + "epoch": 6.0, + "eval_entropy": 0.4365639974578069, + "eval_loss": 0.7624168395996094, + "eval_mean_token_accuracy": 0.8165808867376584, + "eval_num_tokens": 5889300.0, + "eval_runtime": 46.5221, + "eval_samples_per_second": 35.617, + "eval_steps_per_second": 4.471, + "step": 2310 + }, + { + "entropy": 0.28290177062153815, + "epoch": 6.103896103896104, + "grad_norm": 0.5457249283790588, + "learning_rate": 8.77886101695435e-05, + "loss": 0.2029383087158203, + "mean_token_accuracy": 0.9317537224292756, + "num_tokens": 5990679.0, + "step": 2350 + }, + { + "entropy": 0.2539422053098679, + "epoch": 6.233766233766234, + "grad_norm": 1.7734259366989136, + "learning_rate": 8.289718270203239e-05, + "loss": 0.1847425079345703, + "mean_token_accuracy": 0.937881036400795, + "num_tokens": 6117918.0, + "step": 2400 + }, + { + "entropy": 0.2555230759084225, + "epoch": 6.363636363636363, + "grad_norm": 0.49776697158813477, + "learning_rate": 7.806328152738371e-05, + "loss": 0.18783441543579102, + "mean_token_accuracy": 0.936203356385231, + "num_tokens": 6248566.0, + "step": 2450 + }, + { + "entropy": 0.2529813493788242, + "epoch": 6.4935064935064934, + "grad_norm": 0.4968299865722656, + "learning_rate": 7.32968391019587e-05, + "loss": 0.18458471298217774, + "mean_token_accuracy": 0.9365487760305404, + "num_tokens": 6380529.0, + "step": 2500 + }, + { + "entropy": 0.26623836129903794, + "epoch": 6.623376623376624, + "grad_norm": 0.6177894473075867, + "learning_rate": 6.860764927128271e-05, + "loss": 0.19328956604003905, + "mean_token_accuracy": 0.9330078029632568, + "num_tokens": 6501669.0, + "step": 2550 + }, + { + "entropy": 0.2671951600909233, + "epoch": 6.753246753246753, + "grad_norm": 0.6792670488357544, + "learning_rate": 6.400534714614501e-05, + "loss": 0.19405254364013672, + "mean_token_accuracy": 0.9335101181268692, + "num_tokens": 6624404.0, + "step": 2600 + }, + { + "entropy": 0.2522234851121902, + "epoch": 6.883116883116883, + "grad_norm": 0.4798950254917145, + "learning_rate": 5.949938930485951e-05, + "loss": 0.1846565818786621, + "mean_token_accuracy": 0.9369161009788514, + "num_tokens": 6755532.0, + "step": 2650 + }, + { + "epoch": 7.0, + "eval_entropy": 0.3764066012719503, + "eval_loss": 0.8554975390434265, + "eval_mean_token_accuracy": 0.8132733049301001, + "eval_num_tokens": 6870850.0, + "eval_runtime": 46.525, + "eval_samples_per_second": 35.615, + "eval_steps_per_second": 4.471, + "step": 2695 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.810735902601032e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e4d42893a69e354223672a3b0f9de6f2b979c004 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05284766149829996, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj", + "gate_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9416cb221138e648d560c5a35124f78102baaacf --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/trainer_state.json @@ -0,0 +1,732 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.0, + "eval_steps": 500, + "global_step": 3080, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.8529596650600433, + "epoch": 0.12987012987012986, + "grad_norm": 0.878183901309967, + "learning_rate": 2.8227544379109735e-05, + "loss": 1.762685546875, + "mean_token_accuracy": 0.6396001759171486, + "num_tokens": 133878.0, + "step": 50 + }, + { + "entropy": 0.9309259909391403, + "epoch": 0.2597402597402597, + "grad_norm": 0.6279756426811218, + "learning_rate": 5.7031161092487016e-05, + "loss": 0.8922532653808594, + "mean_token_accuracy": 0.768746777176857, + "num_tokens": 265533.0, + "step": 100 + }, + { + "entropy": 0.8268384379148483, + "epoch": 0.38961038961038963, + "grad_norm": 0.5478948354721069, + "learning_rate": 8.583477780586432e-05, + "loss": 0.7756452178955078, + "mean_token_accuracy": 0.7925205600261688, + "num_tokens": 386480.0, + "step": 150 + }, + { + "entropy": 0.7658095824718475, + "epoch": 0.5194805194805194, + "grad_norm": 0.5972853899002075, + "learning_rate": 0.00011463839451924158, + "loss": 0.7161135101318359, + "mean_token_accuracy": 0.8019153982400894, + "num_tokens": 520135.0, + "step": 200 + }, + { + "entropy": 0.769344270825386, + "epoch": 0.6493506493506493, + "grad_norm": 0.5576531887054443, + "learning_rate": 0.0001434420112326189, + "loss": 0.716611557006836, + "mean_token_accuracy": 0.8028716742992401, + "num_tokens": 640303.0, + "step": 250 + }, + { + "entropy": 0.7470700722932816, + "epoch": 0.7792207792207793, + "grad_norm": 0.3937978148460388, + "learning_rate": 0.0001722456279459962, + "loss": 0.6983388519287109, + "mean_token_accuracy": 0.80596988260746, + "num_tokens": 766027.0, + "step": 300 + }, + { + "entropy": 0.7172233510017395, + "epoch": 0.9090909090909091, + "grad_norm": 0.32564300298690796, + "learning_rate": 0.00020104924465937345, + "loss": 0.6731016540527344, + "mean_token_accuracy": 0.8119878542423248, + "num_tokens": 894871.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.7397788639825124, + "eval_loss": 0.7304292917251587, + "eval_mean_token_accuracy": 0.7976073359067624, + "eval_num_tokens": 981550.0, + "eval_runtime": 46.5688, + "eval_samples_per_second": 35.582, + "eval_steps_per_second": 4.467, + "step": 385 + }, + { + "entropy": 0.7035581320524216, + "epoch": 1.0389610389610389, + "grad_norm": 0.45747122168540955, + "learning_rate": 0.00022177891520076015, + "loss": 0.6450813293457032, + "mean_token_accuracy": 0.8178210550546646, + "num_tokens": 1021041.0, + "step": 400 + }, + { + "entropy": 0.6679977881908417, + "epoch": 1.1688311688311688, + "grad_norm": 0.4456019401550293, + "learning_rate": 0.0002216012068086725, + "loss": 0.6189227294921875, + "mean_token_accuracy": 0.8257622331380844, + "num_tokens": 1149161.0, + "step": 450 + }, + { + "entropy": 0.6574469250440598, + "epoch": 1.2987012987012987, + "grad_norm": 0.3784359097480774, + "learning_rate": 0.00022119602267552194, + "loss": 0.6098381805419922, + "mean_token_accuracy": 0.8251238936185836, + "num_tokens": 1278079.0, + "step": 500 + }, + { + "entropy": 0.6593573099374771, + "epoch": 1.4285714285714286, + "grad_norm": 0.36508747935295105, + "learning_rate": 0.00022056419535323196, + "loss": 0.6137563705444335, + "mean_token_accuracy": 0.824503253698349, + "num_tokens": 1405326.0, + "step": 550 + }, + { + "entropy": 0.6459340593218803, + "epoch": 1.5584415584415585, + "grad_norm": 0.39773380756378174, + "learning_rate": 0.00021970702308872148, + "loss": 0.5986767196655274, + "mean_token_accuracy": 0.8270765203237533, + "num_tokens": 1532507.0, + "step": 600 + }, + { + "entropy": 0.633097175359726, + "epoch": 1.6883116883116882, + "grad_norm": 0.41731834411621094, + "learning_rate": 0.00021862626715633265, + "loss": 0.5849922180175782, + "mean_token_accuracy": 0.8313153618574143, + "num_tokens": 1662747.0, + "step": 650 + }, + { + "entropy": 0.6569280475378036, + "epoch": 1.8181818181818183, + "grad_norm": 0.3067665100097656, + "learning_rate": 0.00021732414823885307, + "loss": 0.605062370300293, + "mean_token_accuracy": 0.8269566106796264, + "num_tokens": 1783817.0, + "step": 700 + }, + { + "entropy": 0.6459099870920181, + "epoch": 1.948051948051948, + "grad_norm": 0.5508913397789001, + "learning_rate": 0.00021580334186456886, + "loss": 0.596672134399414, + "mean_token_accuracy": 0.8287447422742844, + "num_tokens": 1914573.0, + "step": 750 + }, + { + "epoch": 2.0, + "eval_entropy": 0.6985598478752834, + "eval_loss": 0.6716415286064148, + "eval_mean_token_accuracy": 0.808078097895934, + "eval_num_tokens": 1963100.0, + "eval_runtime": 46.5197, + "eval_samples_per_second": 35.619, + "eval_steps_per_second": 4.471, + "step": 770 + }, + { + "entropy": 0.6113292586803436, + "epoch": 2.0779220779220777, + "grad_norm": 0.37614893913269043, + "learning_rate": 0.00021406697290972404, + "loss": 0.5562425231933594, + "mean_token_accuracy": 0.8369060623645782, + "num_tokens": 2035968.0, + "step": 800 + }, + { + "entropy": 0.5720089420676231, + "epoch": 2.207792207792208, + "grad_norm": 0.35560500621795654, + "learning_rate": 0.00021211860917768236, + "loss": 0.521314697265625, + "mean_token_accuracy": 0.8453685063123703, + "num_tokens": 2168591.0, + "step": 850 + }, + { + "entropy": 0.5757493850588798, + "epoch": 2.3376623376623376, + "grad_norm": 0.304592490196228, + "learning_rate": 0.00020996225406798486, + "loss": 0.5258681106567383, + "mean_token_accuracy": 0.8431038129329681, + "num_tokens": 2297967.0, + "step": 900 + }, + { + "entropy": 0.5790414094924927, + "epoch": 2.4675324675324677, + "grad_norm": 0.29762616753578186, + "learning_rate": 0.00020760233835036664, + "loss": 0.5219763565063477, + "mean_token_accuracy": 0.8438457292318344, + "num_tokens": 2425614.0, + "step": 950 + }, + { + "entropy": 0.5783660891652107, + "epoch": 2.5974025974025974, + "grad_norm": 0.3418065011501312, + "learning_rate": 0.00020504371106063417, + "loss": 0.5258687210083007, + "mean_token_accuracy": 0.8422278153896332, + "num_tokens": 2557421.0, + "step": 1000 + }, + { + "entropy": 0.5746926316618919, + "epoch": 2.7272727272727275, + "grad_norm": 0.3708217740058899, + "learning_rate": 0.00020229162953711157, + "loss": 0.5161260223388672, + "mean_token_accuracy": 0.8453844922780991, + "num_tokens": 2680379.0, + "step": 1050 + }, + { + "entropy": 0.5676264691352845, + "epoch": 2.857142857142857, + "grad_norm": 0.26185691356658936, + "learning_rate": 0.00019935174861812654, + "loss": 0.5179851913452148, + "mean_token_accuracy": 0.8455151951313019, + "num_tokens": 2810854.0, + "step": 1100 + }, + { + "entropy": 0.5804974803328514, + "epoch": 2.987012987012987, + "grad_norm": 0.3131248652935028, + "learning_rate": 0.00019623010902273397, + "loss": 0.5243957138061524, + "mean_token_accuracy": 0.8455932134389877, + "num_tokens": 2932637.0, + "step": 1150 + }, + { + "epoch": 3.0, + "eval_entropy": 0.6151215266436338, + "eval_loss": 0.6557295918464661, + "eval_mean_token_accuracy": 0.8159411993737404, + "eval_num_tokens": 2944650.0, + "eval_runtime": 46.5152, + "eval_samples_per_second": 35.623, + "eval_steps_per_second": 4.472, + "step": 1155 + }, + { + "entropy": 0.5100037640333176, + "epoch": 3.116883116883117, + "grad_norm": 0.32568028569221497, + "learning_rate": 0.00019293312493855094, + "loss": 0.4522856140136719, + "mean_token_accuracy": 0.8600019490718842, + "num_tokens": 3054046.0, + "step": 1200 + }, + { + "entropy": 0.5058341425657272, + "epoch": 3.2467532467532467, + "grad_norm": 0.3810749650001526, + "learning_rate": 0.00018946757084220762, + "loss": 0.44891536712646485, + "mean_token_accuracy": 0.8606845206022262, + "num_tokens": 3182515.0, + "step": 1250 + }, + { + "entropy": 0.5006153827905655, + "epoch": 3.3766233766233764, + "grad_norm": 0.35473746061325073, + "learning_rate": 0.0001858405675794941, + "loss": 0.451065673828125, + "mean_token_accuracy": 0.8608392387628555, + "num_tokens": 3313672.0, + "step": 1300 + }, + { + "entropy": 0.508549126982689, + "epoch": 3.5064935064935066, + "grad_norm": 0.3414103388786316, + "learning_rate": 0.00018205956773380578, + "loss": 0.4535030746459961, + "mean_token_accuracy": 0.859148946404457, + "num_tokens": 3440689.0, + "step": 1350 + }, + { + "entropy": 0.516265479028225, + "epoch": 3.6363636363636362, + "grad_norm": 0.4597613513469696, + "learning_rate": 0.00017813234031295068, + "loss": 0.45882129669189453, + "mean_token_accuracy": 0.8582911169528962, + "num_tokens": 3566427.0, + "step": 1400 + }, + { + "entropy": 0.4902383416891098, + "epoch": 3.7662337662337664, + "grad_norm": 0.3385583460330963, + "learning_rate": 0.0001740669547857841, + "loss": 0.4417523193359375, + "mean_token_accuracy": 0.8632881045341492, + "num_tokens": 3699976.0, + "step": 1450 + }, + { + "entropy": 0.5104234129190445, + "epoch": 3.896103896103896, + "grad_norm": 0.3935626149177551, + "learning_rate": 0.00016987176450147088, + "loss": 0.4547672653198242, + "mean_token_accuracy": 0.8592299193143844, + "num_tokens": 3827926.0, + "step": 1500 + }, + { + "epoch": 4.0, + "eval_entropy": 0.565094108478381, + "eval_loss": 0.6673083305358887, + "eval_mean_token_accuracy": 0.8179781915476689, + "eval_num_tokens": 3926200.0, + "eval_runtime": 46.4987, + "eval_samples_per_second": 35.635, + "eval_steps_per_second": 4.473, + "step": 1540 + }, + { + "entropy": 0.49345644742250444, + "epoch": 4.025974025974026, + "grad_norm": 0.42027363181114197, + "learning_rate": 0.00016555538952544487, + "loss": 0.4355708312988281, + "mean_token_accuracy": 0.8648384511470795, + "num_tokens": 3953109.0, + "step": 1550 + }, + { + "entropy": 0.4247111546993256, + "epoch": 4.1558441558441555, + "grad_norm": 0.4478093087673187, + "learning_rate": 0.00016112669892733307, + "loss": 0.36362716674804685, + "mean_token_accuracy": 0.8844931781291961, + "num_tokens": 4081079.0, + "step": 1600 + }, + { + "entropy": 0.42337420970201495, + "epoch": 4.285714285714286, + "grad_norm": 0.35812026262283325, + "learning_rate": 0.00015659479255723875, + "loss": 0.3651982498168945, + "mean_token_accuracy": 0.8830066406726838, + "num_tokens": 4206426.0, + "step": 1650 + }, + { + "entropy": 0.43247521698474883, + "epoch": 4.415584415584416, + "grad_norm": 0.3665122985839844, + "learning_rate": 0.0001519689823478283, + "loss": 0.37368186950683596, + "mean_token_accuracy": 0.8800795775651932, + "num_tokens": 4329839.0, + "step": 1700 + }, + { + "entropy": 0.4328634282946587, + "epoch": 4.545454545454545, + "grad_norm": 0.33961260318756104, + "learning_rate": 0.00014725877318064152, + "loss": 0.37599964141845704, + "mean_token_accuracy": 0.8797144430875778, + "num_tokens": 4453940.0, + "step": 1750 + }, + { + "entropy": 0.42742862343788146, + "epoch": 4.675324675324675, + "grad_norm": 0.4312469959259033, + "learning_rate": 0.0001424738433559405, + "loss": 0.37354656219482424, + "mean_token_accuracy": 0.880826217532158, + "num_tokens": 4583626.0, + "step": 1800 + }, + { + "entropy": 0.4280877533555031, + "epoch": 4.805194805194805, + "grad_norm": 0.4343482255935669, + "learning_rate": 0.0001376240247062263, + "loss": 0.3688441467285156, + "mean_token_accuracy": 0.8814105206727981, + "num_tokens": 4713540.0, + "step": 1850 + }, + { + "entropy": 0.42952129155397417, + "epoch": 4.935064935064935, + "grad_norm": 0.47131651639938354, + "learning_rate": 0.00013271928239428512, + "loss": 0.37270416259765626, + "mean_token_accuracy": 0.8788579875230789, + "num_tokens": 4845678.0, + "step": 1900 + }, + { + "epoch": 5.0, + "eval_entropy": 0.5064233084424183, + "eval_loss": 0.689122200012207, + "eval_mean_token_accuracy": 0.818150080453891, + "eval_num_tokens": 4907750.0, + "eval_runtime": 46.5114, + "eval_samples_per_second": 35.626, + "eval_steps_per_second": 4.472, + "step": 1925 + }, + { + "entropy": 0.3798492255806923, + "epoch": 5.064935064935065, + "grad_norm": 0.414468377828598, + "learning_rate": 0.0001277696944372747, + "loss": 0.31735713958740236, + "mean_token_accuracy": 0.8974772602319717, + "num_tokens": 4975955.0, + "step": 1950 + }, + { + "entropy": 0.3367840954661369, + "epoch": 5.194805194805195, + "grad_norm": 0.4515029191970825, + "learning_rate": 0.00012278543099892257, + "loss": 0.272756290435791, + "mean_token_accuracy": 0.9084931749105454, + "num_tokens": 5104770.0, + "step": 2000 + }, + { + "entropy": 0.34766108095645903, + "epoch": 5.324675324675325, + "grad_norm": 0.4040578007698059, + "learning_rate": 0.00011777673349238672, + "loss": 0.2792487144470215, + "mean_token_accuracy": 0.9065623581409454, + "num_tokens": 5229052.0, + "step": 2050 + }, + { + "entropy": 0.3566804251074791, + "epoch": 5.454545454545454, + "grad_norm": 0.5059600472450256, + "learning_rate": 0.00011275389353671628, + "loss": 0.2896596145629883, + "mean_token_accuracy": 0.9045185309648514, + "num_tokens": 5357753.0, + "step": 2100 + }, + { + "entropy": 0.3462292793393135, + "epoch": 5.584415584415584, + "grad_norm": 0.4664144217967987, + "learning_rate": 0.00010772723181015153, + "loss": 0.27794593811035156, + "mean_token_accuracy": 0.9075321304798126, + "num_tokens": 5481550.0, + "step": 2150 + }, + { + "entropy": 0.3434346827864647, + "epoch": 5.714285714285714, + "grad_norm": 0.46017780900001526, + "learning_rate": 0.00010270707684371499, + "loss": 0.2783885383605957, + "mean_token_accuracy": 0.9063384455442428, + "num_tokens": 5609104.0, + "step": 2200 + }, + { + "entropy": 0.341832632124424, + "epoch": 5.8441558441558445, + "grad_norm": 0.429608017206192, + "learning_rate": 9.77037437986665e-05, + "loss": 0.2815263748168945, + "mean_token_accuracy": 0.9060239523649216, + "num_tokens": 5737347.0, + "step": 2250 + }, + { + "entropy": 0.3492265248298645, + "epoch": 5.974025974025974, + "grad_norm": 0.5019266605377197, + "learning_rate": 9.272751327143021e-05, + "loss": 0.2844840621948242, + "mean_token_accuracy": 0.9042869365215301, + "num_tokens": 5861872.0, + "step": 2300 + }, + { + "epoch": 6.0, + "eval_entropy": 0.4365639974578069, + "eval_loss": 0.7624168395996094, + "eval_mean_token_accuracy": 0.8165808867376584, + "eval_num_tokens": 5889300.0, + "eval_runtime": 46.5221, + "eval_samples_per_second": 35.617, + "eval_steps_per_second": 4.471, + "step": 2310 + }, + { + "entropy": 0.28290177062153815, + "epoch": 6.103896103896104, + "grad_norm": 0.5457249283790588, + "learning_rate": 8.77886101695435e-05, + "loss": 0.2029383087158203, + "mean_token_accuracy": 0.9317537224292756, + "num_tokens": 5990679.0, + "step": 2350 + }, + { + "entropy": 0.2539422053098679, + "epoch": 6.233766233766234, + "grad_norm": 1.7734259366989136, + "learning_rate": 8.289718270203239e-05, + "loss": 0.1847425079345703, + "mean_token_accuracy": 0.937881036400795, + "num_tokens": 6117918.0, + "step": 2400 + }, + { + "entropy": 0.2555230759084225, + "epoch": 6.363636363636363, + "grad_norm": 0.49776697158813477, + "learning_rate": 7.806328152738371e-05, + "loss": 0.18783441543579102, + "mean_token_accuracy": 0.936203356385231, + "num_tokens": 6248566.0, + "step": 2450 + }, + { + "entropy": 0.2529813493788242, + "epoch": 6.4935064935064934, + "grad_norm": 0.4968299865722656, + "learning_rate": 7.32968391019587e-05, + "loss": 0.18458471298217774, + "mean_token_accuracy": 0.9365487760305404, + "num_tokens": 6380529.0, + "step": 2500 + }, + { + "entropy": 0.26623836129903794, + "epoch": 6.623376623376624, + "grad_norm": 0.6177894473075867, + "learning_rate": 6.860764927128271e-05, + "loss": 0.19328956604003905, + "mean_token_accuracy": 0.9330078029632568, + "num_tokens": 6501669.0, + "step": 2550 + }, + { + "entropy": 0.2671951600909233, + "epoch": 6.753246753246753, + "grad_norm": 0.6792670488357544, + "learning_rate": 6.400534714614501e-05, + "loss": 0.19405254364013672, + "mean_token_accuracy": 0.9335101181268692, + "num_tokens": 6624404.0, + "step": 2600 + }, + { + "entropy": 0.2522234851121902, + "epoch": 6.883116883116883, + "grad_norm": 0.4798950254917145, + "learning_rate": 5.949938930485951e-05, + "loss": 0.1846565818786621, + "mean_token_accuracy": 0.9369161009788514, + "num_tokens": 6755532.0, + "step": 2650 + }, + { + "epoch": 7.0, + "eval_entropy": 0.3764066012719503, + "eval_loss": 0.8554975390434265, + "eval_mean_token_accuracy": 0.8132733049301001, + "eval_num_tokens": 6870850.0, + "eval_runtime": 46.525, + "eval_samples_per_second": 35.615, + "eval_steps_per_second": 4.471, + "step": 2695 + }, + { + "entropy": 0.24576591402292253, + "epoch": 7.012987012987013, + "grad_norm": 0.3926205039024353, + "learning_rate": 5.5099034362364085e-05, + "loss": 0.1780208969116211, + "mean_token_accuracy": 0.9398036235570908, + "num_tokens": 6884338.0, + "step": 2700 + }, + { + "entropy": 0.18769787922501563, + "epoch": 7.142857142857143, + "grad_norm": 0.5062244534492493, + "learning_rate": 5.0813323946085895e-05, + "loss": 0.115099458694458, + "mean_token_accuracy": 0.961904166340828, + "num_tokens": 7008981.0, + "step": 2750 + }, + { + "entropy": 0.1834849800169468, + "epoch": 7.2727272727272725, + "grad_norm": 0.4319317638874054, + "learning_rate": 4.665106411766087e-05, + "loss": 0.11364558219909668, + "mean_token_accuracy": 0.9627783286571503, + "num_tokens": 7135402.0, + "step": 2800 + }, + { + "entropy": 0.17929002813994885, + "epoch": 7.402597402597403, + "grad_norm": 0.4122151732444763, + "learning_rate": 4.2620807278682855e-05, + "loss": 0.11115352630615234, + "mean_token_accuracy": 0.9625132656097413, + "num_tokens": 7265920.0, + "step": 2850 + }, + { + "entropy": 0.18764832600951195, + "epoch": 7.532467532467533, + "grad_norm": 0.4621254801750183, + "learning_rate": 3.873083459765971e-05, + "loss": 0.11578564643859864, + "mean_token_accuracy": 0.9611306923627854, + "num_tokens": 7389633.0, + "step": 2900 + }, + { + "entropy": 0.18220983803272248, + "epoch": 7.662337662337662, + "grad_norm": 0.45269420742988586, + "learning_rate": 3.498913899428605e-05, + "loss": 0.11399910926818847, + "mean_token_accuracy": 0.961864430308342, + "num_tokens": 7518696.0, + "step": 2950 + }, + { + "entropy": 0.18482646018266677, + "epoch": 7.792207792207792, + "grad_norm": 0.4839811325073242, + "learning_rate": 3.1403408715994884e-05, + "loss": 0.11555064201354981, + "mean_token_accuracy": 0.9611130750179291, + "num_tokens": 7643525.0, + "step": 3000 + }, + { + "entropy": 0.18274843357503415, + "epoch": 7.922077922077922, + "grad_norm": 0.46752479672431946, + "learning_rate": 2.798101154053465e-05, + "loss": 0.11180784225463868, + "mean_token_accuracy": 0.9621868497133255, + "num_tokens": 7773494.0, + "step": 3050 + }, + { + "epoch": 8.0, + "eval_entropy": 0.32566706384890354, + "eval_loss": 0.9837347865104675, + "eval_mean_token_accuracy": 0.8104732755858165, + "eval_num_tokens": 7852400.0, + "eval_runtime": 46.5488, + "eval_samples_per_second": 35.597, + "eval_steps_per_second": 4.468, + "step": 3080 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.3542904203667354e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e4d42893a69e354223672a3b0f9de6f2b979c004 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05284766149829996, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj", + "gate_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d3c2c962325ef43efd7ef06e9ad3a6df8d17f877 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/trainer_state.json @@ -0,0 +1,823 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.0, + "eval_steps": 500, + "global_step": 3465, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.8529596650600433, + "epoch": 0.12987012987012986, + "grad_norm": 0.878183901309967, + "learning_rate": 2.8227544379109735e-05, + "loss": 1.762685546875, + "mean_token_accuracy": 0.6396001759171486, + "num_tokens": 133878.0, + "step": 50 + }, + { + "entropy": 0.9309259909391403, + "epoch": 0.2597402597402597, + "grad_norm": 0.6279756426811218, + "learning_rate": 5.7031161092487016e-05, + "loss": 0.8922532653808594, + "mean_token_accuracy": 0.768746777176857, + "num_tokens": 265533.0, + "step": 100 + }, + { + "entropy": 0.8268384379148483, + "epoch": 0.38961038961038963, + "grad_norm": 0.5478948354721069, + "learning_rate": 8.583477780586432e-05, + "loss": 0.7756452178955078, + "mean_token_accuracy": 0.7925205600261688, + "num_tokens": 386480.0, + "step": 150 + }, + { + "entropy": 0.7658095824718475, + "epoch": 0.5194805194805194, + "grad_norm": 0.5972853899002075, + "learning_rate": 0.00011463839451924158, + "loss": 0.7161135101318359, + "mean_token_accuracy": 0.8019153982400894, + "num_tokens": 520135.0, + "step": 200 + }, + { + "entropy": 0.769344270825386, + "epoch": 0.6493506493506493, + "grad_norm": 0.5576531887054443, + "learning_rate": 0.0001434420112326189, + "loss": 0.716611557006836, + "mean_token_accuracy": 0.8028716742992401, + "num_tokens": 640303.0, + "step": 250 + }, + { + "entropy": 0.7470700722932816, + "epoch": 0.7792207792207793, + "grad_norm": 0.3937978148460388, + "learning_rate": 0.0001722456279459962, + "loss": 0.6983388519287109, + "mean_token_accuracy": 0.80596988260746, + "num_tokens": 766027.0, + "step": 300 + }, + { + "entropy": 0.7172233510017395, + "epoch": 0.9090909090909091, + "grad_norm": 0.32564300298690796, + "learning_rate": 0.00020104924465937345, + "loss": 0.6731016540527344, + "mean_token_accuracy": 0.8119878542423248, + "num_tokens": 894871.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.7397788639825124, + "eval_loss": 0.7304292917251587, + "eval_mean_token_accuracy": 0.7976073359067624, + "eval_num_tokens": 981550.0, + "eval_runtime": 46.5688, + "eval_samples_per_second": 35.582, + "eval_steps_per_second": 4.467, + "step": 385 + }, + { + "entropy": 0.7035581320524216, + "epoch": 1.0389610389610389, + "grad_norm": 0.45747122168540955, + "learning_rate": 0.00022177891520076015, + "loss": 0.6450813293457032, + "mean_token_accuracy": 0.8178210550546646, + "num_tokens": 1021041.0, + "step": 400 + }, + { + "entropy": 0.6679977881908417, + "epoch": 1.1688311688311688, + "grad_norm": 0.4456019401550293, + "learning_rate": 0.0002216012068086725, + "loss": 0.6189227294921875, + "mean_token_accuracy": 0.8257622331380844, + "num_tokens": 1149161.0, + "step": 450 + }, + { + "entropy": 0.6574469250440598, + "epoch": 1.2987012987012987, + "grad_norm": 0.3784359097480774, + "learning_rate": 0.00022119602267552194, + "loss": 0.6098381805419922, + "mean_token_accuracy": 0.8251238936185836, + "num_tokens": 1278079.0, + "step": 500 + }, + { + "entropy": 0.6593573099374771, + "epoch": 1.4285714285714286, + "grad_norm": 0.36508747935295105, + "learning_rate": 0.00022056419535323196, + "loss": 0.6137563705444335, + "mean_token_accuracy": 0.824503253698349, + "num_tokens": 1405326.0, + "step": 550 + }, + { + "entropy": 0.6459340593218803, + "epoch": 1.5584415584415585, + "grad_norm": 0.39773380756378174, + "learning_rate": 0.00021970702308872148, + "loss": 0.5986767196655274, + "mean_token_accuracy": 0.8270765203237533, + "num_tokens": 1532507.0, + "step": 600 + }, + { + "entropy": 0.633097175359726, + "epoch": 1.6883116883116882, + "grad_norm": 0.41731834411621094, + "learning_rate": 0.00021862626715633265, + "loss": 0.5849922180175782, + "mean_token_accuracy": 0.8313153618574143, + "num_tokens": 1662747.0, + "step": 650 + }, + { + "entropy": 0.6569280475378036, + "epoch": 1.8181818181818183, + "grad_norm": 0.3067665100097656, + "learning_rate": 0.00021732414823885307, + "loss": 0.605062370300293, + "mean_token_accuracy": 0.8269566106796264, + "num_tokens": 1783817.0, + "step": 700 + }, + { + "entropy": 0.6459099870920181, + "epoch": 1.948051948051948, + "grad_norm": 0.5508913397789001, + "learning_rate": 0.00021580334186456886, + "loss": 0.596672134399414, + "mean_token_accuracy": 0.8287447422742844, + "num_tokens": 1914573.0, + "step": 750 + }, + { + "epoch": 2.0, + "eval_entropy": 0.6985598478752834, + "eval_loss": 0.6716415286064148, + "eval_mean_token_accuracy": 0.808078097895934, + "eval_num_tokens": 1963100.0, + "eval_runtime": 46.5197, + "eval_samples_per_second": 35.619, + "eval_steps_per_second": 4.471, + "step": 770 + }, + { + "entropy": 0.6113292586803436, + "epoch": 2.0779220779220777, + "grad_norm": 0.37614893913269043, + "learning_rate": 0.00021406697290972404, + "loss": 0.5562425231933594, + "mean_token_accuracy": 0.8369060623645782, + "num_tokens": 2035968.0, + "step": 800 + }, + { + "entropy": 0.5720089420676231, + "epoch": 2.207792207792208, + "grad_norm": 0.35560500621795654, + "learning_rate": 0.00021211860917768236, + "loss": 0.521314697265625, + "mean_token_accuracy": 0.8453685063123703, + "num_tokens": 2168591.0, + "step": 850 + }, + { + "entropy": 0.5757493850588798, + "epoch": 2.3376623376623376, + "grad_norm": 0.304592490196228, + "learning_rate": 0.00020996225406798486, + "loss": 0.5258681106567383, + "mean_token_accuracy": 0.8431038129329681, + "num_tokens": 2297967.0, + "step": 900 + }, + { + "entropy": 0.5790414094924927, + "epoch": 2.4675324675324677, + "grad_norm": 0.29762616753578186, + "learning_rate": 0.00020760233835036664, + "loss": 0.5219763565063477, + "mean_token_accuracy": 0.8438457292318344, + "num_tokens": 2425614.0, + "step": 950 + }, + { + "entropy": 0.5783660891652107, + "epoch": 2.5974025974025974, + "grad_norm": 0.3418065011501312, + "learning_rate": 0.00020504371106063417, + "loss": 0.5258687210083007, + "mean_token_accuracy": 0.8422278153896332, + "num_tokens": 2557421.0, + "step": 1000 + }, + { + "entropy": 0.5746926316618919, + "epoch": 2.7272727272727275, + "grad_norm": 0.3708217740058899, + "learning_rate": 0.00020229162953711157, + "loss": 0.5161260223388672, + "mean_token_accuracy": 0.8453844922780991, + "num_tokens": 2680379.0, + "step": 1050 + }, + { + "entropy": 0.5676264691352845, + "epoch": 2.857142857142857, + "grad_norm": 0.26185691356658936, + "learning_rate": 0.00019935174861812654, + "loss": 0.5179851913452148, + "mean_token_accuracy": 0.8455151951313019, + "num_tokens": 2810854.0, + "step": 1100 + }, + { + "entropy": 0.5804974803328514, + "epoch": 2.987012987012987, + "grad_norm": 0.3131248652935028, + "learning_rate": 0.00019623010902273397, + "loss": 0.5243957138061524, + "mean_token_accuracy": 0.8455932134389877, + "num_tokens": 2932637.0, + "step": 1150 + }, + { + "epoch": 3.0, + "eval_entropy": 0.6151215266436338, + "eval_loss": 0.6557295918464661, + "eval_mean_token_accuracy": 0.8159411993737404, + "eval_num_tokens": 2944650.0, + "eval_runtime": 46.5152, + "eval_samples_per_second": 35.623, + "eval_steps_per_second": 4.472, + "step": 1155 + }, + { + "entropy": 0.5100037640333176, + "epoch": 3.116883116883117, + "grad_norm": 0.32568028569221497, + "learning_rate": 0.00019293312493855094, + "loss": 0.4522856140136719, + "mean_token_accuracy": 0.8600019490718842, + "num_tokens": 3054046.0, + "step": 1200 + }, + { + "entropy": 0.5058341425657272, + "epoch": 3.2467532467532467, + "grad_norm": 0.3810749650001526, + "learning_rate": 0.00018946757084220762, + "loss": 0.44891536712646485, + "mean_token_accuracy": 0.8606845206022262, + "num_tokens": 3182515.0, + "step": 1250 + }, + { + "entropy": 0.5006153827905655, + "epoch": 3.3766233766233764, + "grad_norm": 0.35473746061325073, + "learning_rate": 0.0001858405675794941, + "loss": 0.451065673828125, + "mean_token_accuracy": 0.8608392387628555, + "num_tokens": 3313672.0, + "step": 1300 + }, + { + "entropy": 0.508549126982689, + "epoch": 3.5064935064935066, + "grad_norm": 0.3414103388786316, + "learning_rate": 0.00018205956773380578, + "loss": 0.4535030746459961, + "mean_token_accuracy": 0.859148946404457, + "num_tokens": 3440689.0, + "step": 1350 + }, + { + "entropy": 0.516265479028225, + "epoch": 3.6363636363636362, + "grad_norm": 0.4597613513469696, + "learning_rate": 0.00017813234031295068, + "loss": 0.45882129669189453, + "mean_token_accuracy": 0.8582911169528962, + "num_tokens": 3566427.0, + "step": 1400 + }, + { + "entropy": 0.4902383416891098, + "epoch": 3.7662337662337664, + "grad_norm": 0.3385583460330963, + "learning_rate": 0.0001740669547857841, + "loss": 0.4417523193359375, + "mean_token_accuracy": 0.8632881045341492, + "num_tokens": 3699976.0, + "step": 1450 + }, + { + "entropy": 0.5104234129190445, + "epoch": 3.896103896103896, + "grad_norm": 0.3935626149177551, + "learning_rate": 0.00016987176450147088, + "loss": 0.4547672653198242, + "mean_token_accuracy": 0.8592299193143844, + "num_tokens": 3827926.0, + "step": 1500 + }, + { + "epoch": 4.0, + "eval_entropy": 0.565094108478381, + "eval_loss": 0.6673083305358887, + "eval_mean_token_accuracy": 0.8179781915476689, + "eval_num_tokens": 3926200.0, + "eval_runtime": 46.4987, + "eval_samples_per_second": 35.635, + "eval_steps_per_second": 4.473, + "step": 1540 + }, + { + "entropy": 0.49345644742250444, + "epoch": 4.025974025974026, + "grad_norm": 0.42027363181114197, + "learning_rate": 0.00016555538952544487, + "loss": 0.4355708312988281, + "mean_token_accuracy": 0.8648384511470795, + "num_tokens": 3953109.0, + "step": 1550 + }, + { + "entropy": 0.4247111546993256, + "epoch": 4.1558441558441555, + "grad_norm": 0.4478093087673187, + "learning_rate": 0.00016112669892733307, + "loss": 0.36362716674804685, + "mean_token_accuracy": 0.8844931781291961, + "num_tokens": 4081079.0, + "step": 1600 + }, + { + "entropy": 0.42337420970201495, + "epoch": 4.285714285714286, + "grad_norm": 0.35812026262283325, + "learning_rate": 0.00015659479255723875, + "loss": 0.3651982498168945, + "mean_token_accuracy": 0.8830066406726838, + "num_tokens": 4206426.0, + "step": 1650 + }, + { + "entropy": 0.43247521698474883, + "epoch": 4.415584415584416, + "grad_norm": 0.3665122985839844, + "learning_rate": 0.0001519689823478283, + "loss": 0.37368186950683596, + "mean_token_accuracy": 0.8800795775651932, + "num_tokens": 4329839.0, + "step": 1700 + }, + { + "entropy": 0.4328634282946587, + "epoch": 4.545454545454545, + "grad_norm": 0.33961260318756104, + "learning_rate": 0.00014725877318064152, + "loss": 0.37599964141845704, + "mean_token_accuracy": 0.8797144430875778, + "num_tokens": 4453940.0, + "step": 1750 + }, + { + "entropy": 0.42742862343788146, + "epoch": 4.675324675324675, + "grad_norm": 0.4312469959259033, + "learning_rate": 0.0001424738433559405, + "loss": 0.37354656219482424, + "mean_token_accuracy": 0.880826217532158, + "num_tokens": 4583626.0, + "step": 1800 + }, + { + "entropy": 0.4280877533555031, + "epoch": 4.805194805194805, + "grad_norm": 0.4343482255935669, + "learning_rate": 0.0001376240247062263, + "loss": 0.3688441467285156, + "mean_token_accuracy": 0.8814105206727981, + "num_tokens": 4713540.0, + "step": 1850 + }, + { + "entropy": 0.42952129155397417, + "epoch": 4.935064935064935, + "grad_norm": 0.47131651639938354, + "learning_rate": 0.00013271928239428512, + "loss": 0.37270416259765626, + "mean_token_accuracy": 0.8788579875230789, + "num_tokens": 4845678.0, + "step": 1900 + }, + { + "epoch": 5.0, + "eval_entropy": 0.5064233084424183, + "eval_loss": 0.689122200012207, + "eval_mean_token_accuracy": 0.818150080453891, + "eval_num_tokens": 4907750.0, + "eval_runtime": 46.5114, + "eval_samples_per_second": 35.626, + "eval_steps_per_second": 4.472, + "step": 1925 + }, + { + "entropy": 0.3798492255806923, + "epoch": 5.064935064935065, + "grad_norm": 0.414468377828598, + "learning_rate": 0.0001277696944372747, + "loss": 0.31735713958740236, + "mean_token_accuracy": 0.8974772602319717, + "num_tokens": 4975955.0, + "step": 1950 + }, + { + "entropy": 0.3367840954661369, + "epoch": 5.194805194805195, + "grad_norm": 0.4515029191970825, + "learning_rate": 0.00012278543099892257, + "loss": 0.272756290435791, + "mean_token_accuracy": 0.9084931749105454, + "num_tokens": 5104770.0, + "step": 2000 + }, + { + "entropy": 0.34766108095645903, + "epoch": 5.324675324675325, + "grad_norm": 0.4040578007698059, + "learning_rate": 0.00011777673349238672, + "loss": 0.2792487144470215, + "mean_token_accuracy": 0.9065623581409454, + "num_tokens": 5229052.0, + "step": 2050 + }, + { + "entropy": 0.3566804251074791, + "epoch": 5.454545454545454, + "grad_norm": 0.5059600472450256, + "learning_rate": 0.00011275389353671628, + "loss": 0.2896596145629883, + "mean_token_accuracy": 0.9045185309648514, + "num_tokens": 5357753.0, + "step": 2100 + }, + { + "entropy": 0.3462292793393135, + "epoch": 5.584415584415584, + "grad_norm": 0.4664144217967987, + "learning_rate": 0.00010772723181015153, + "loss": 0.27794593811035156, + "mean_token_accuracy": 0.9075321304798126, + "num_tokens": 5481550.0, + "step": 2150 + }, + { + "entropy": 0.3434346827864647, + "epoch": 5.714285714285714, + "grad_norm": 0.46017780900001526, + "learning_rate": 0.00010270707684371499, + "loss": 0.2783885383605957, + "mean_token_accuracy": 0.9063384455442428, + "num_tokens": 5609104.0, + "step": 2200 + }, + { + "entropy": 0.341832632124424, + "epoch": 5.8441558441558445, + "grad_norm": 0.429608017206192, + "learning_rate": 9.77037437986665e-05, + "loss": 0.2815263748168945, + "mean_token_accuracy": 0.9060239523649216, + "num_tokens": 5737347.0, + "step": 2250 + }, + { + "entropy": 0.3492265248298645, + "epoch": 5.974025974025974, + "grad_norm": 0.5019266605377197, + "learning_rate": 9.272751327143021e-05, + "loss": 0.2844840621948242, + "mean_token_accuracy": 0.9042869365215301, + "num_tokens": 5861872.0, + "step": 2300 + }, + { + "epoch": 6.0, + "eval_entropy": 0.4365639974578069, + "eval_loss": 0.7624168395996094, + "eval_mean_token_accuracy": 0.8165808867376584, + "eval_num_tokens": 5889300.0, + "eval_runtime": 46.5221, + "eval_samples_per_second": 35.617, + "eval_steps_per_second": 4.471, + "step": 2310 + }, + { + "entropy": 0.28290177062153815, + "epoch": 6.103896103896104, + "grad_norm": 0.5457249283790588, + "learning_rate": 8.77886101695435e-05, + "loss": 0.2029383087158203, + "mean_token_accuracy": 0.9317537224292756, + "num_tokens": 5990679.0, + "step": 2350 + }, + { + "entropy": 0.2539422053098679, + "epoch": 6.233766233766234, + "grad_norm": 1.7734259366989136, + "learning_rate": 8.289718270203239e-05, + "loss": 0.1847425079345703, + "mean_token_accuracy": 0.937881036400795, + "num_tokens": 6117918.0, + "step": 2400 + }, + { + "entropy": 0.2555230759084225, + "epoch": 6.363636363636363, + "grad_norm": 0.49776697158813477, + "learning_rate": 7.806328152738371e-05, + "loss": 0.18783441543579102, + "mean_token_accuracy": 0.936203356385231, + "num_tokens": 6248566.0, + "step": 2450 + }, + { + "entropy": 0.2529813493788242, + "epoch": 6.4935064935064934, + "grad_norm": 0.4968299865722656, + "learning_rate": 7.32968391019587e-05, + "loss": 0.18458471298217774, + "mean_token_accuracy": 0.9365487760305404, + "num_tokens": 6380529.0, + "step": 2500 + }, + { + "entropy": 0.26623836129903794, + "epoch": 6.623376623376624, + "grad_norm": 0.6177894473075867, + "learning_rate": 6.860764927128271e-05, + "loss": 0.19328956604003905, + "mean_token_accuracy": 0.9330078029632568, + "num_tokens": 6501669.0, + "step": 2550 + }, + { + "entropy": 0.2671951600909233, + "epoch": 6.753246753246753, + "grad_norm": 0.6792670488357544, + "learning_rate": 6.400534714614501e-05, + "loss": 0.19405254364013672, + "mean_token_accuracy": 0.9335101181268692, + "num_tokens": 6624404.0, + "step": 2600 + }, + { + "entropy": 0.2522234851121902, + "epoch": 6.883116883116883, + "grad_norm": 0.4798950254917145, + "learning_rate": 5.949938930485951e-05, + "loss": 0.1846565818786621, + "mean_token_accuracy": 0.9369161009788514, + "num_tokens": 6755532.0, + "step": 2650 + }, + { + "epoch": 7.0, + "eval_entropy": 0.3764066012719503, + "eval_loss": 0.8554975390434265, + "eval_mean_token_accuracy": 0.8132733049301001, + "eval_num_tokens": 6870850.0, + "eval_runtime": 46.525, + "eval_samples_per_second": 35.615, + "eval_steps_per_second": 4.471, + "step": 2695 + }, + { + "entropy": 0.24576591402292253, + "epoch": 7.012987012987013, + "grad_norm": 0.3926205039024353, + "learning_rate": 5.5099034362364085e-05, + "loss": 0.1780208969116211, + "mean_token_accuracy": 0.9398036235570908, + "num_tokens": 6884338.0, + "step": 2700 + }, + { + "entropy": 0.18769787922501563, + "epoch": 7.142857142857143, + "grad_norm": 0.5062244534492493, + "learning_rate": 5.0813323946085895e-05, + "loss": 0.115099458694458, + "mean_token_accuracy": 0.961904166340828, + "num_tokens": 7008981.0, + "step": 2750 + }, + { + "entropy": 0.1834849800169468, + "epoch": 7.2727272727272725, + "grad_norm": 0.4319317638874054, + "learning_rate": 4.665106411766087e-05, + "loss": 0.11364558219909668, + "mean_token_accuracy": 0.9627783286571503, + "num_tokens": 7135402.0, + "step": 2800 + }, + { + "entropy": 0.17929002813994885, + "epoch": 7.402597402597403, + "grad_norm": 0.4122151732444763, + "learning_rate": 4.2620807278682855e-05, + "loss": 0.11115352630615234, + "mean_token_accuracy": 0.9625132656097413, + "num_tokens": 7265920.0, + "step": 2850 + }, + { + "entropy": 0.18764832600951195, + "epoch": 7.532467532467533, + "grad_norm": 0.4621254801750183, + "learning_rate": 3.873083459765971e-05, + "loss": 0.11578564643859864, + "mean_token_accuracy": 0.9611306923627854, + "num_tokens": 7389633.0, + "step": 2900 + }, + { + "entropy": 0.18220983803272248, + "epoch": 7.662337662337662, + "grad_norm": 0.45269420742988586, + "learning_rate": 3.498913899428605e-05, + "loss": 0.11399910926818847, + "mean_token_accuracy": 0.961864430308342, + "num_tokens": 7518696.0, + "step": 2950 + }, + { + "entropy": 0.18482646018266677, + "epoch": 7.792207792207792, + "grad_norm": 0.4839811325073242, + "learning_rate": 3.1403408715994884e-05, + "loss": 0.11555064201354981, + "mean_token_accuracy": 0.9611130750179291, + "num_tokens": 7643525.0, + "step": 3000 + }, + { + "entropy": 0.18274843357503415, + "epoch": 7.922077922077922, + "grad_norm": 0.46752479672431946, + "learning_rate": 2.798101154053465e-05, + "loss": 0.11180784225463868, + "mean_token_accuracy": 0.9621868497133255, + "num_tokens": 7773494.0, + "step": 3050 + }, + { + "epoch": 8.0, + "eval_entropy": 0.32566706384890354, + "eval_loss": 0.9837347865104675, + "eval_mean_token_accuracy": 0.8104732755858165, + "eval_num_tokens": 7852400.0, + "eval_runtime": 46.5488, + "eval_samples_per_second": 35.597, + "eval_steps_per_second": 4.468, + "step": 3080 + }, + { + "entropy": 0.1633647498488426, + "epoch": 8.051948051948052, + "grad_norm": 0.4305579960346222, + "learning_rate": 2.472897963703081e-05, + "loss": 0.09586874961853027, + "mean_token_accuracy": 0.9682459622621536, + "num_tokens": 7903348.0, + "step": 3100 + }, + { + "entropy": 0.14545966424047946, + "epoch": 8.181818181818182, + "grad_norm": 0.3315879702568054, + "learning_rate": 2.1653995116639546e-05, + "loss": 0.07627681255340577, + "mean_token_accuracy": 0.9752496027946472, + "num_tokens": 8028778.0, + "step": 3150 + }, + { + "entropy": 0.14618608497083188, + "epoch": 8.311688311688311, + "grad_norm": 0.30432993173599243, + "learning_rate": 1.876237630248263e-05, + "loss": 0.07688333988189697, + "mean_token_accuracy": 0.9748979198932648, + "num_tokens": 8153699.0, + "step": 3200 + }, + { + "entropy": 0.14237778432667256, + "epoch": 8.441558441558442, + "grad_norm": 0.299809068441391, + "learning_rate": 1.606006474707584e-05, + "loss": 0.07612751007080078, + "mean_token_accuracy": 0.9753290069103241, + "num_tokens": 8281408.0, + "step": 3250 + }, + { + "entropy": 0.14949887059628963, + "epoch": 8.571428571428571, + "grad_norm": 0.4242253601551056, + "learning_rate": 1.355261302392631e-05, + "loss": 0.08111579895019531, + "mean_token_accuracy": 0.9737774491310119, + "num_tokens": 8400657.0, + "step": 3300 + }, + { + "entropy": 0.1369019091874361, + "epoch": 8.7012987012987, + "grad_norm": 0.2756560742855072, + "learning_rate": 1.1245173318384599e-05, + "loss": 0.07382246494293213, + "mean_token_accuracy": 0.9767089641094208, + "num_tokens": 8533080.0, + "step": 3350 + }, + { + "entropy": 0.13799229875206948, + "epoch": 8.831168831168831, + "grad_norm": 0.32391059398651123, + "learning_rate": 9.14248684119404e-06, + "loss": 0.07332521915435791, + "mean_token_accuracy": 0.975739398598671, + "num_tokens": 8664922.0, + "step": 3400 + }, + { + "entropy": 0.13693079218268395, + "epoch": 8.96103896103896, + "grad_norm": 0.3435397446155548, + "learning_rate": 7.248874086490063e-06, + "loss": 0.0737720012664795, + "mean_token_accuracy": 0.9756388676166534, + "num_tokens": 8797714.0, + "step": 3450 + }, + { + "epoch": 9.0, + "eval_entropy": 0.2884131854113478, + "eval_loss": 1.1368393898010254, + "eval_mean_token_accuracy": 0.8096811464772775, + "eval_num_tokens": 8833950.0, + "eval_runtime": 46.5492, + "eval_samples_per_second": 35.597, + "eval_steps_per_second": 4.468, + "step": 3465 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.8981730000368845e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e4d42893a69e354223672a3b0f9de6f2b979c004 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05284766149829996, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj", + "gate_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3689945836de870b77e8a2338a315e50eafcb091 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/trainer_state.json @@ -0,0 +1,115 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 385, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.8529596650600433, + "epoch": 0.12987012987012986, + "grad_norm": 0.878183901309967, + "learning_rate": 2.8227544379109735e-05, + "loss": 1.762685546875, + "mean_token_accuracy": 0.6396001759171486, + "num_tokens": 133878.0, + "step": 50 + }, + { + "entropy": 0.9309259909391403, + "epoch": 0.2597402597402597, + "grad_norm": 0.6279756426811218, + "learning_rate": 5.7031161092487016e-05, + "loss": 0.8922532653808594, + "mean_token_accuracy": 0.768746777176857, + "num_tokens": 265533.0, + "step": 100 + }, + { + "entropy": 0.8268384379148483, + "epoch": 0.38961038961038963, + "grad_norm": 0.5478948354721069, + "learning_rate": 8.583477780586432e-05, + "loss": 0.7756452178955078, + "mean_token_accuracy": 0.7925205600261688, + "num_tokens": 386480.0, + "step": 150 + }, + { + "entropy": 0.7658095824718475, + "epoch": 0.5194805194805194, + "grad_norm": 0.5972853899002075, + "learning_rate": 0.00011463839451924158, + "loss": 0.7161135101318359, + "mean_token_accuracy": 0.8019153982400894, + "num_tokens": 520135.0, + "step": 200 + }, + { + "entropy": 0.769344270825386, + "epoch": 0.6493506493506493, + "grad_norm": 0.5576531887054443, + "learning_rate": 0.0001434420112326189, + "loss": 0.716611557006836, + "mean_token_accuracy": 0.8028716742992401, + "num_tokens": 640303.0, + "step": 250 + }, + { + "entropy": 0.7470700722932816, + "epoch": 0.7792207792207793, + "grad_norm": 0.3937978148460388, + "learning_rate": 0.0001722456279459962, + "loss": 0.6983388519287109, + "mean_token_accuracy": 0.80596988260746, + "num_tokens": 766027.0, + "step": 300 + }, + { + "entropy": 0.7172233510017395, + "epoch": 0.9090909090909091, + "grad_norm": 0.32564300298690796, + "learning_rate": 0.00020104924465937345, + "loss": 0.6731016540527344, + "mean_token_accuracy": 0.8119878542423248, + "num_tokens": 894871.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.7397788639825124, + "eval_loss": 0.7304292917251587, + "eval_mean_token_accuracy": 0.7976073359067624, + "eval_num_tokens": 981550.0, + "eval_runtime": 46.5688, + "eval_samples_per_second": 35.582, + "eval_steps_per_second": 4.467, + "step": 385 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.429451631126118e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e4d42893a69e354223672a3b0f9de6f2b979c004 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05284766149829996, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj", + "gate_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b04683da5d407a815bb860c3c5a1b481e272edd7 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/trainer_state.json @@ -0,0 +1,914 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "eval_steps": 500, + "global_step": 3850, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.8529596650600433, + "epoch": 0.12987012987012986, + "grad_norm": 0.878183901309967, + "learning_rate": 2.8227544379109735e-05, + "loss": 1.762685546875, + "mean_token_accuracy": 0.6396001759171486, + "num_tokens": 133878.0, + "step": 50 + }, + { + "entropy": 0.9309259909391403, + "epoch": 0.2597402597402597, + "grad_norm": 0.6279756426811218, + "learning_rate": 5.7031161092487016e-05, + "loss": 0.8922532653808594, + "mean_token_accuracy": 0.768746777176857, + "num_tokens": 265533.0, + "step": 100 + }, + { + "entropy": 0.8268384379148483, + "epoch": 0.38961038961038963, + "grad_norm": 0.5478948354721069, + "learning_rate": 8.583477780586432e-05, + "loss": 0.7756452178955078, + "mean_token_accuracy": 0.7925205600261688, + "num_tokens": 386480.0, + "step": 150 + }, + { + "entropy": 0.7658095824718475, + "epoch": 0.5194805194805194, + "grad_norm": 0.5972853899002075, + "learning_rate": 0.00011463839451924158, + "loss": 0.7161135101318359, + "mean_token_accuracy": 0.8019153982400894, + "num_tokens": 520135.0, + "step": 200 + }, + { + "entropy": 0.769344270825386, + "epoch": 0.6493506493506493, + "grad_norm": 0.5576531887054443, + "learning_rate": 0.0001434420112326189, + "loss": 0.716611557006836, + "mean_token_accuracy": 0.8028716742992401, + "num_tokens": 640303.0, + "step": 250 + }, + { + "entropy": 0.7470700722932816, + "epoch": 0.7792207792207793, + "grad_norm": 0.3937978148460388, + "learning_rate": 0.0001722456279459962, + "loss": 0.6983388519287109, + "mean_token_accuracy": 0.80596988260746, + "num_tokens": 766027.0, + "step": 300 + }, + { + "entropy": 0.7172233510017395, + "epoch": 0.9090909090909091, + "grad_norm": 0.32564300298690796, + "learning_rate": 0.00020104924465937345, + "loss": 0.6731016540527344, + "mean_token_accuracy": 0.8119878542423248, + "num_tokens": 894871.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.7397788639825124, + "eval_loss": 0.7304292917251587, + "eval_mean_token_accuracy": 0.7976073359067624, + "eval_num_tokens": 981550.0, + "eval_runtime": 46.5688, + "eval_samples_per_second": 35.582, + "eval_steps_per_second": 4.467, + "step": 385 + }, + { + "entropy": 0.7035581320524216, + "epoch": 1.0389610389610389, + "grad_norm": 0.45747122168540955, + "learning_rate": 0.00022177891520076015, + "loss": 0.6450813293457032, + "mean_token_accuracy": 0.8178210550546646, + "num_tokens": 1021041.0, + "step": 400 + }, + { + "entropy": 0.6679977881908417, + "epoch": 1.1688311688311688, + "grad_norm": 0.4456019401550293, + "learning_rate": 0.0002216012068086725, + "loss": 0.6189227294921875, + "mean_token_accuracy": 0.8257622331380844, + "num_tokens": 1149161.0, + "step": 450 + }, + { + "entropy": 0.6574469250440598, + "epoch": 1.2987012987012987, + "grad_norm": 0.3784359097480774, + "learning_rate": 0.00022119602267552194, + "loss": 0.6098381805419922, + "mean_token_accuracy": 0.8251238936185836, + "num_tokens": 1278079.0, + "step": 500 + }, + { + "entropy": 0.6593573099374771, + "epoch": 1.4285714285714286, + "grad_norm": 0.36508747935295105, + "learning_rate": 0.00022056419535323196, + "loss": 0.6137563705444335, + "mean_token_accuracy": 0.824503253698349, + "num_tokens": 1405326.0, + "step": 550 + }, + { + "entropy": 0.6459340593218803, + "epoch": 1.5584415584415585, + "grad_norm": 0.39773380756378174, + "learning_rate": 0.00021970702308872148, + "loss": 0.5986767196655274, + "mean_token_accuracy": 0.8270765203237533, + "num_tokens": 1532507.0, + "step": 600 + }, + { + "entropy": 0.633097175359726, + "epoch": 1.6883116883116882, + "grad_norm": 0.41731834411621094, + "learning_rate": 0.00021862626715633265, + "loss": 0.5849922180175782, + "mean_token_accuracy": 0.8313153618574143, + "num_tokens": 1662747.0, + "step": 650 + }, + { + "entropy": 0.6569280475378036, + "epoch": 1.8181818181818183, + "grad_norm": 0.3067665100097656, + "learning_rate": 0.00021732414823885307, + "loss": 0.605062370300293, + "mean_token_accuracy": 0.8269566106796264, + "num_tokens": 1783817.0, + "step": 700 + }, + { + "entropy": 0.6459099870920181, + "epoch": 1.948051948051948, + "grad_norm": 0.5508913397789001, + "learning_rate": 0.00021580334186456886, + "loss": 0.596672134399414, + "mean_token_accuracy": 0.8287447422742844, + "num_tokens": 1914573.0, + "step": 750 + }, + { + "epoch": 2.0, + "eval_entropy": 0.6985598478752834, + "eval_loss": 0.6716415286064148, + "eval_mean_token_accuracy": 0.808078097895934, + "eval_num_tokens": 1963100.0, + "eval_runtime": 46.5197, + "eval_samples_per_second": 35.619, + "eval_steps_per_second": 4.471, + "step": 770 + }, + { + "entropy": 0.6113292586803436, + "epoch": 2.0779220779220777, + "grad_norm": 0.37614893913269043, + "learning_rate": 0.00021406697290972404, + "loss": 0.5562425231933594, + "mean_token_accuracy": 0.8369060623645782, + "num_tokens": 2035968.0, + "step": 800 + }, + { + "entropy": 0.5720089420676231, + "epoch": 2.207792207792208, + "grad_norm": 0.35560500621795654, + "learning_rate": 0.00021211860917768236, + "loss": 0.521314697265625, + "mean_token_accuracy": 0.8453685063123703, + "num_tokens": 2168591.0, + "step": 850 + }, + { + "entropy": 0.5757493850588798, + "epoch": 2.3376623376623376, + "grad_norm": 0.304592490196228, + "learning_rate": 0.00020996225406798486, + "loss": 0.5258681106567383, + "mean_token_accuracy": 0.8431038129329681, + "num_tokens": 2297967.0, + "step": 900 + }, + { + "entropy": 0.5790414094924927, + "epoch": 2.4675324675324677, + "grad_norm": 0.29762616753578186, + "learning_rate": 0.00020760233835036664, + "loss": 0.5219763565063477, + "mean_token_accuracy": 0.8438457292318344, + "num_tokens": 2425614.0, + "step": 950 + }, + { + "entropy": 0.5783660891652107, + "epoch": 2.5974025974025974, + "grad_norm": 0.3418065011501312, + "learning_rate": 0.00020504371106063417, + "loss": 0.5258687210083007, + "mean_token_accuracy": 0.8422278153896332, + "num_tokens": 2557421.0, + "step": 1000 + }, + { + "entropy": 0.5746926316618919, + "epoch": 2.7272727272727275, + "grad_norm": 0.3708217740058899, + "learning_rate": 0.00020229162953711157, + "loss": 0.5161260223388672, + "mean_token_accuracy": 0.8453844922780991, + "num_tokens": 2680379.0, + "step": 1050 + }, + { + "entropy": 0.5676264691352845, + "epoch": 2.857142857142857, + "grad_norm": 0.26185691356658936, + "learning_rate": 0.00019935174861812654, + "loss": 0.5179851913452148, + "mean_token_accuracy": 0.8455151951313019, + "num_tokens": 2810854.0, + "step": 1100 + }, + { + "entropy": 0.5804974803328514, + "epoch": 2.987012987012987, + "grad_norm": 0.3131248652935028, + "learning_rate": 0.00019623010902273397, + "loss": 0.5243957138061524, + "mean_token_accuracy": 0.8455932134389877, + "num_tokens": 2932637.0, + "step": 1150 + }, + { + "epoch": 3.0, + "eval_entropy": 0.6151215266436338, + "eval_loss": 0.6557295918464661, + "eval_mean_token_accuracy": 0.8159411993737404, + "eval_num_tokens": 2944650.0, + "eval_runtime": 46.5152, + "eval_samples_per_second": 35.623, + "eval_steps_per_second": 4.472, + "step": 1155 + }, + { + "entropy": 0.5100037640333176, + "epoch": 3.116883116883117, + "grad_norm": 0.32568028569221497, + "learning_rate": 0.00019293312493855094, + "loss": 0.4522856140136719, + "mean_token_accuracy": 0.8600019490718842, + "num_tokens": 3054046.0, + "step": 1200 + }, + { + "entropy": 0.5058341425657272, + "epoch": 3.2467532467532467, + "grad_norm": 0.3810749650001526, + "learning_rate": 0.00018946757084220762, + "loss": 0.44891536712646485, + "mean_token_accuracy": 0.8606845206022262, + "num_tokens": 3182515.0, + "step": 1250 + }, + { + "entropy": 0.5006153827905655, + "epoch": 3.3766233766233764, + "grad_norm": 0.35473746061325073, + "learning_rate": 0.0001858405675794941, + "loss": 0.451065673828125, + "mean_token_accuracy": 0.8608392387628555, + "num_tokens": 3313672.0, + "step": 1300 + }, + { + "entropy": 0.508549126982689, + "epoch": 3.5064935064935066, + "grad_norm": 0.3414103388786316, + "learning_rate": 0.00018205956773380578, + "loss": 0.4535030746459961, + "mean_token_accuracy": 0.859148946404457, + "num_tokens": 3440689.0, + "step": 1350 + }, + { + "entropy": 0.516265479028225, + "epoch": 3.6363636363636362, + "grad_norm": 0.4597613513469696, + "learning_rate": 0.00017813234031295068, + "loss": 0.45882129669189453, + "mean_token_accuracy": 0.8582911169528962, + "num_tokens": 3566427.0, + "step": 1400 + }, + { + "entropy": 0.4902383416891098, + "epoch": 3.7662337662337664, + "grad_norm": 0.3385583460330963, + "learning_rate": 0.0001740669547857841, + "loss": 0.4417523193359375, + "mean_token_accuracy": 0.8632881045341492, + "num_tokens": 3699976.0, + "step": 1450 + }, + { + "entropy": 0.5104234129190445, + "epoch": 3.896103896103896, + "grad_norm": 0.3935626149177551, + "learning_rate": 0.00016987176450147088, + "loss": 0.4547672653198242, + "mean_token_accuracy": 0.8592299193143844, + "num_tokens": 3827926.0, + "step": 1500 + }, + { + "epoch": 4.0, + "eval_entropy": 0.565094108478381, + "eval_loss": 0.6673083305358887, + "eval_mean_token_accuracy": 0.8179781915476689, + "eval_num_tokens": 3926200.0, + "eval_runtime": 46.4987, + "eval_samples_per_second": 35.635, + "eval_steps_per_second": 4.473, + "step": 1540 + }, + { + "entropy": 0.49345644742250444, + "epoch": 4.025974025974026, + "grad_norm": 0.42027363181114197, + "learning_rate": 0.00016555538952544487, + "loss": 0.4355708312988281, + "mean_token_accuracy": 0.8648384511470795, + "num_tokens": 3953109.0, + "step": 1550 + }, + { + "entropy": 0.4247111546993256, + "epoch": 4.1558441558441555, + "grad_norm": 0.4478093087673187, + "learning_rate": 0.00016112669892733307, + "loss": 0.36362716674804685, + "mean_token_accuracy": 0.8844931781291961, + "num_tokens": 4081079.0, + "step": 1600 + }, + { + "entropy": 0.42337420970201495, + "epoch": 4.285714285714286, + "grad_norm": 0.35812026262283325, + "learning_rate": 0.00015659479255723875, + "loss": 0.3651982498168945, + "mean_token_accuracy": 0.8830066406726838, + "num_tokens": 4206426.0, + "step": 1650 + }, + { + "entropy": 0.43247521698474883, + "epoch": 4.415584415584416, + "grad_norm": 0.3665122985839844, + "learning_rate": 0.0001519689823478283, + "loss": 0.37368186950683596, + "mean_token_accuracy": 0.8800795775651932, + "num_tokens": 4329839.0, + "step": 1700 + }, + { + "entropy": 0.4328634282946587, + "epoch": 4.545454545454545, + "grad_norm": 0.33961260318756104, + "learning_rate": 0.00014725877318064152, + "loss": 0.37599964141845704, + "mean_token_accuracy": 0.8797144430875778, + "num_tokens": 4453940.0, + "step": 1750 + }, + { + "entropy": 0.42742862343788146, + "epoch": 4.675324675324675, + "grad_norm": 0.4312469959259033, + "learning_rate": 0.0001424738433559405, + "loss": 0.37354656219482424, + "mean_token_accuracy": 0.880826217532158, + "num_tokens": 4583626.0, + "step": 1800 + }, + { + "entropy": 0.4280877533555031, + "epoch": 4.805194805194805, + "grad_norm": 0.4343482255935669, + "learning_rate": 0.0001376240247062263, + "loss": 0.3688441467285156, + "mean_token_accuracy": 0.8814105206727981, + "num_tokens": 4713540.0, + "step": 1850 + }, + { + "entropy": 0.42952129155397417, + "epoch": 4.935064935064935, + "grad_norm": 0.47131651639938354, + "learning_rate": 0.00013271928239428512, + "loss": 0.37270416259765626, + "mean_token_accuracy": 0.8788579875230789, + "num_tokens": 4845678.0, + "step": 1900 + }, + { + "epoch": 5.0, + "eval_entropy": 0.5064233084424183, + "eval_loss": 0.689122200012207, + "eval_mean_token_accuracy": 0.818150080453891, + "eval_num_tokens": 4907750.0, + "eval_runtime": 46.5114, + "eval_samples_per_second": 35.626, + "eval_steps_per_second": 4.472, + "step": 1925 + }, + { + "entropy": 0.3798492255806923, + "epoch": 5.064935064935065, + "grad_norm": 0.414468377828598, + "learning_rate": 0.0001277696944372747, + "loss": 0.31735713958740236, + "mean_token_accuracy": 0.8974772602319717, + "num_tokens": 4975955.0, + "step": 1950 + }, + { + "entropy": 0.3367840954661369, + "epoch": 5.194805194805195, + "grad_norm": 0.4515029191970825, + "learning_rate": 0.00012278543099892257, + "loss": 0.272756290435791, + "mean_token_accuracy": 0.9084931749105454, + "num_tokens": 5104770.0, + "step": 2000 + }, + { + "entropy": 0.34766108095645903, + "epoch": 5.324675324675325, + "grad_norm": 0.4040578007698059, + "learning_rate": 0.00011777673349238672, + "loss": 0.2792487144470215, + "mean_token_accuracy": 0.9065623581409454, + "num_tokens": 5229052.0, + "step": 2050 + }, + { + "entropy": 0.3566804251074791, + "epoch": 5.454545454545454, + "grad_norm": 0.5059600472450256, + "learning_rate": 0.00011275389353671628, + "loss": 0.2896596145629883, + "mean_token_accuracy": 0.9045185309648514, + "num_tokens": 5357753.0, + "step": 2100 + }, + { + "entropy": 0.3462292793393135, + "epoch": 5.584415584415584, + "grad_norm": 0.4664144217967987, + "learning_rate": 0.00010772723181015153, + "loss": 0.27794593811035156, + "mean_token_accuracy": 0.9075321304798126, + "num_tokens": 5481550.0, + "step": 2150 + }, + { + "entropy": 0.3434346827864647, + "epoch": 5.714285714285714, + "grad_norm": 0.46017780900001526, + "learning_rate": 0.00010270707684371499, + "loss": 0.2783885383605957, + "mean_token_accuracy": 0.9063384455442428, + "num_tokens": 5609104.0, + "step": 2200 + }, + { + "entropy": 0.341832632124424, + "epoch": 5.8441558441558445, + "grad_norm": 0.429608017206192, + "learning_rate": 9.77037437986665e-05, + "loss": 0.2815263748168945, + "mean_token_accuracy": 0.9060239523649216, + "num_tokens": 5737347.0, + "step": 2250 + }, + { + "entropy": 0.3492265248298645, + "epoch": 5.974025974025974, + "grad_norm": 0.5019266605377197, + "learning_rate": 9.272751327143021e-05, + "loss": 0.2844840621948242, + "mean_token_accuracy": 0.9042869365215301, + "num_tokens": 5861872.0, + "step": 2300 + }, + { + "epoch": 6.0, + "eval_entropy": 0.4365639974578069, + "eval_loss": 0.7624168395996094, + "eval_mean_token_accuracy": 0.8165808867376584, + "eval_num_tokens": 5889300.0, + "eval_runtime": 46.5221, + "eval_samples_per_second": 35.617, + "eval_steps_per_second": 4.471, + "step": 2310 + }, + { + "entropy": 0.28290177062153815, + "epoch": 6.103896103896104, + "grad_norm": 0.5457249283790588, + "learning_rate": 8.77886101695435e-05, + "loss": 0.2029383087158203, + "mean_token_accuracy": 0.9317537224292756, + "num_tokens": 5990679.0, + "step": 2350 + }, + { + "entropy": 0.2539422053098679, + "epoch": 6.233766233766234, + "grad_norm": 1.7734259366989136, + "learning_rate": 8.289718270203239e-05, + "loss": 0.1847425079345703, + "mean_token_accuracy": 0.937881036400795, + "num_tokens": 6117918.0, + "step": 2400 + }, + { + "entropy": 0.2555230759084225, + "epoch": 6.363636363636363, + "grad_norm": 0.49776697158813477, + "learning_rate": 7.806328152738371e-05, + "loss": 0.18783441543579102, + "mean_token_accuracy": 0.936203356385231, + "num_tokens": 6248566.0, + "step": 2450 + }, + { + "entropy": 0.2529813493788242, + "epoch": 6.4935064935064934, + "grad_norm": 0.4968299865722656, + "learning_rate": 7.32968391019587e-05, + "loss": 0.18458471298217774, + "mean_token_accuracy": 0.9365487760305404, + "num_tokens": 6380529.0, + "step": 2500 + }, + { + "entropy": 0.26623836129903794, + "epoch": 6.623376623376624, + "grad_norm": 0.6177894473075867, + "learning_rate": 6.860764927128271e-05, + "loss": 0.19328956604003905, + "mean_token_accuracy": 0.9330078029632568, + "num_tokens": 6501669.0, + "step": 2550 + }, + { + "entropy": 0.2671951600909233, + "epoch": 6.753246753246753, + "grad_norm": 0.6792670488357544, + "learning_rate": 6.400534714614501e-05, + "loss": 0.19405254364013672, + "mean_token_accuracy": 0.9335101181268692, + "num_tokens": 6624404.0, + "step": 2600 + }, + { + "entropy": 0.2522234851121902, + "epoch": 6.883116883116883, + "grad_norm": 0.4798950254917145, + "learning_rate": 5.949938930485951e-05, + "loss": 0.1846565818786621, + "mean_token_accuracy": 0.9369161009788514, + "num_tokens": 6755532.0, + "step": 2650 + }, + { + "epoch": 7.0, + "eval_entropy": 0.3764066012719503, + "eval_loss": 0.8554975390434265, + "eval_mean_token_accuracy": 0.8132733049301001, + "eval_num_tokens": 6870850.0, + "eval_runtime": 46.525, + "eval_samples_per_second": 35.615, + "eval_steps_per_second": 4.471, + "step": 2695 + }, + { + "entropy": 0.24576591402292253, + "epoch": 7.012987012987013, + "grad_norm": 0.3926205039024353, + "learning_rate": 5.5099034362364085e-05, + "loss": 0.1780208969116211, + "mean_token_accuracy": 0.9398036235570908, + "num_tokens": 6884338.0, + "step": 2700 + }, + { + "entropy": 0.18769787922501563, + "epoch": 7.142857142857143, + "grad_norm": 0.5062244534492493, + "learning_rate": 5.0813323946085895e-05, + "loss": 0.115099458694458, + "mean_token_accuracy": 0.961904166340828, + "num_tokens": 7008981.0, + "step": 2750 + }, + { + "entropy": 0.1834849800169468, + "epoch": 7.2727272727272725, + "grad_norm": 0.4319317638874054, + "learning_rate": 4.665106411766087e-05, + "loss": 0.11364558219909668, + "mean_token_accuracy": 0.9627783286571503, + "num_tokens": 7135402.0, + "step": 2800 + }, + { + "entropy": 0.17929002813994885, + "epoch": 7.402597402597403, + "grad_norm": 0.4122151732444763, + "learning_rate": 4.2620807278682855e-05, + "loss": 0.11115352630615234, + "mean_token_accuracy": 0.9625132656097413, + "num_tokens": 7265920.0, + "step": 2850 + }, + { + "entropy": 0.18764832600951195, + "epoch": 7.532467532467533, + "grad_norm": 0.4621254801750183, + "learning_rate": 3.873083459765971e-05, + "loss": 0.11578564643859864, + "mean_token_accuracy": 0.9611306923627854, + "num_tokens": 7389633.0, + "step": 2900 + }, + { + "entropy": 0.18220983803272248, + "epoch": 7.662337662337662, + "grad_norm": 0.45269420742988586, + "learning_rate": 3.498913899428605e-05, + "loss": 0.11399910926818847, + "mean_token_accuracy": 0.961864430308342, + "num_tokens": 7518696.0, + "step": 2950 + }, + { + "entropy": 0.18482646018266677, + "epoch": 7.792207792207792, + "grad_norm": 0.4839811325073242, + "learning_rate": 3.1403408715994884e-05, + "loss": 0.11555064201354981, + "mean_token_accuracy": 0.9611130750179291, + "num_tokens": 7643525.0, + "step": 3000 + }, + { + "entropy": 0.18274843357503415, + "epoch": 7.922077922077922, + "grad_norm": 0.46752479672431946, + "learning_rate": 2.798101154053465e-05, + "loss": 0.11180784225463868, + "mean_token_accuracy": 0.9621868497133255, + "num_tokens": 7773494.0, + "step": 3050 + }, + { + "epoch": 8.0, + "eval_entropy": 0.32566706384890354, + "eval_loss": 0.9837347865104675, + "eval_mean_token_accuracy": 0.8104732755858165, + "eval_num_tokens": 7852400.0, + "eval_runtime": 46.5488, + "eval_samples_per_second": 35.597, + "eval_steps_per_second": 4.468, + "step": 3080 + }, + { + "entropy": 0.1633647498488426, + "epoch": 8.051948051948052, + "grad_norm": 0.4305579960346222, + "learning_rate": 2.472897963703081e-05, + "loss": 0.09586874961853027, + "mean_token_accuracy": 0.9682459622621536, + "num_tokens": 7903348.0, + "step": 3100 + }, + { + "entropy": 0.14545966424047946, + "epoch": 8.181818181818182, + "grad_norm": 0.3315879702568054, + "learning_rate": 2.1653995116639546e-05, + "loss": 0.07627681255340577, + "mean_token_accuracy": 0.9752496027946472, + "num_tokens": 8028778.0, + "step": 3150 + }, + { + "entropy": 0.14618608497083188, + "epoch": 8.311688311688311, + "grad_norm": 0.30432993173599243, + "learning_rate": 1.876237630248263e-05, + "loss": 0.07688333988189697, + "mean_token_accuracy": 0.9748979198932648, + "num_tokens": 8153699.0, + "step": 3200 + }, + { + "entropy": 0.14237778432667256, + "epoch": 8.441558441558442, + "grad_norm": 0.299809068441391, + "learning_rate": 1.606006474707584e-05, + "loss": 0.07612751007080078, + "mean_token_accuracy": 0.9753290069103241, + "num_tokens": 8281408.0, + "step": 3250 + }, + { + "entropy": 0.14949887059628963, + "epoch": 8.571428571428571, + "grad_norm": 0.4242253601551056, + "learning_rate": 1.355261302392631e-05, + "loss": 0.08111579895019531, + "mean_token_accuracy": 0.9737774491310119, + "num_tokens": 8400657.0, + "step": 3300 + }, + { + "entropy": 0.1369019091874361, + "epoch": 8.7012987012987, + "grad_norm": 0.2756560742855072, + "learning_rate": 1.1245173318384599e-05, + "loss": 0.07382246494293213, + "mean_token_accuracy": 0.9767089641094208, + "num_tokens": 8533080.0, + "step": 3350 + }, + { + "entropy": 0.13799229875206948, + "epoch": 8.831168831168831, + "grad_norm": 0.32391059398651123, + "learning_rate": 9.14248684119404e-06, + "loss": 0.07332521915435791, + "mean_token_accuracy": 0.975739398598671, + "num_tokens": 8664922.0, + "step": 3400 + }, + { + "entropy": 0.13693079218268395, + "epoch": 8.96103896103896, + "grad_norm": 0.3435397446155548, + "learning_rate": 7.248874086490063e-06, + "loss": 0.0737720012664795, + "mean_token_accuracy": 0.9756388676166534, + "num_tokens": 8797714.0, + "step": 3450 + }, + { + "epoch": 9.0, + "eval_entropy": 0.2884131854113478, + "eval_loss": 1.1368393898010254, + "eval_mean_token_accuracy": 0.8096811464772775, + "eval_num_tokens": 8833950.0, + "eval_runtime": 46.5492, + "eval_samples_per_second": 35.597, + "eval_steps_per_second": 4.468, + "step": 3465 + }, + { + "entropy": 0.13195306338369847, + "epoch": 9.090909090909092, + "grad_norm": 0.3066045641899109, + "learning_rate": 5.568225954266577e-06, + "loss": 0.06769096851348877, + "mean_token_accuracy": 0.9786338102817536, + "num_tokens": 8925069.0, + "step": 3500 + }, + { + "entropy": 0.1242513469606638, + "epoch": 9.220779220779221, + "grad_norm": 0.21904544532299042, + "learning_rate": 4.103995755551041e-06, + "loss": 0.061446948051452635, + "mean_token_accuracy": 0.9808032715320587, + "num_tokens": 9056953.0, + "step": 3550 + }, + { + "entropy": 0.1268651543557644, + "epoch": 9.35064935064935, + "grad_norm": 0.3049603998661041, + "learning_rate": 2.8591921167149736e-06, + "loss": 0.06346964359283447, + "mean_token_accuracy": 0.9791843616962432, + "num_tokens": 9186990.0, + "step": 3600 + }, + { + "entropy": 0.1311937213689089, + "epoch": 9.480519480519481, + "grad_norm": 0.2234215885400772, + "learning_rate": 1.8363727975003678e-06, + "loss": 0.06572246074676513, + "mean_token_accuracy": 0.9789830875396729, + "num_tokens": 9311897.0, + "step": 3650 + }, + { + "entropy": 0.13369733810424805, + "epoch": 9.61038961038961, + "grad_norm": 0.2675652503967285, + "learning_rate": 1.0376394354638012e-06, + "loss": 0.0670329761505127, + "mean_token_accuracy": 0.9782129484415054, + "num_tokens": 9434396.0, + "step": 3700 + }, + { + "entropy": 0.13093947909772397, + "epoch": 9.74025974025974, + "grad_norm": 0.24558641016483307, + "learning_rate": 4.646332276376641e-07, + "loss": 0.06605084896087647, + "mean_token_accuracy": 0.9788406610488891, + "num_tokens": 9560766.0, + "step": 3750 + }, + { + "entropy": 0.13027613274753094, + "epoch": 9.87012987012987, + "grad_norm": 0.26439452171325684, + "learning_rate": 1.1853155828124577e-07, + "loss": 0.06636544704437256, + "mean_token_accuracy": 0.9784610909223557, + "num_tokens": 9686573.0, + "step": 3800 + }, + { + "entropy": 0.12754634492099284, + "epoch": 10.0, + "grad_norm": 0.3149493932723999, + "learning_rate": 4.5579650927833065e-11, + "loss": 0.06343324661254883, + "mean_token_accuracy": 0.9797626113891602, + "num_tokens": 9815500.0, + "step": 3850 + }, + { + "epoch": 10.0, + "eval_entropy": 0.27749587402034265, + "eval_loss": 1.2116529941558838, + "eval_mean_token_accuracy": 0.8092864940945919, + "eval_num_tokens": 9815500.0, + "eval_runtime": 46.5276, + "eval_samples_per_second": 35.613, + "eval_steps_per_second": 4.47, + "step": 3850 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.436930629170852e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e4d42893a69e354223672a3b0f9de6f2b979c004 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05284766149829996, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj", + "gate_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..96b3397fd4527850dd5d13eff943a1847c023137 --- /dev/null +++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/trainer_state.json @@ -0,0 +1,206 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 770, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.8529596650600433, + "epoch": 0.12987012987012986, + "grad_norm": 0.878183901309967, + "learning_rate": 2.8227544379109735e-05, + "loss": 1.762685546875, + "mean_token_accuracy": 0.6396001759171486, + "num_tokens": 133878.0, + "step": 50 + }, + { + "entropy": 0.9309259909391403, + "epoch": 0.2597402597402597, + "grad_norm": 0.6279756426811218, + "learning_rate": 5.7031161092487016e-05, + "loss": 0.8922532653808594, + "mean_token_accuracy": 0.768746777176857, + "num_tokens": 265533.0, + "step": 100 + }, + { + "entropy": 0.8268384379148483, + "epoch": 0.38961038961038963, + "grad_norm": 0.5478948354721069, + "learning_rate": 8.583477780586432e-05, + "loss": 0.7756452178955078, + "mean_token_accuracy": 0.7925205600261688, + "num_tokens": 386480.0, + "step": 150 + }, + { + "entropy": 0.7658095824718475, + "epoch": 0.5194805194805194, + "grad_norm": 0.5972853899002075, + "learning_rate": 0.00011463839451924158, + "loss": 0.7161135101318359, + "mean_token_accuracy": 0.8019153982400894, + "num_tokens": 520135.0, + "step": 200 + }, + { + "entropy": 0.769344270825386, + "epoch": 0.6493506493506493, + "grad_norm": 0.5576531887054443, + "learning_rate": 0.0001434420112326189, + "loss": 0.716611557006836, + "mean_token_accuracy": 0.8028716742992401, + "num_tokens": 640303.0, + "step": 250 + }, + { + "entropy": 0.7470700722932816, + "epoch": 0.7792207792207793, + "grad_norm": 0.3937978148460388, + "learning_rate": 0.0001722456279459962, + "loss": 0.6983388519287109, + "mean_token_accuracy": 0.80596988260746, + "num_tokens": 766027.0, + "step": 300 + }, + { + "entropy": 0.7172233510017395, + "epoch": 0.9090909090909091, + "grad_norm": 0.32564300298690796, + "learning_rate": 0.00020104924465937345, + "loss": 0.6731016540527344, + "mean_token_accuracy": 0.8119878542423248, + "num_tokens": 894871.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.7397788639825124, + "eval_loss": 0.7304292917251587, + "eval_mean_token_accuracy": 0.7976073359067624, + "eval_num_tokens": 981550.0, + "eval_runtime": 46.5688, + "eval_samples_per_second": 35.582, + "eval_steps_per_second": 4.467, + "step": 385 + }, + { + "entropy": 0.7035581320524216, + "epoch": 1.0389610389610389, + "grad_norm": 0.45747122168540955, + "learning_rate": 0.00022177891520076015, + "loss": 0.6450813293457032, + "mean_token_accuracy": 0.8178210550546646, + "num_tokens": 1021041.0, + "step": 400 + }, + { + "entropy": 0.6679977881908417, + "epoch": 1.1688311688311688, + "grad_norm": 0.4456019401550293, + "learning_rate": 0.0002216012068086725, + "loss": 0.6189227294921875, + "mean_token_accuracy": 0.8257622331380844, + "num_tokens": 1149161.0, + "step": 450 + }, + { + "entropy": 0.6574469250440598, + "epoch": 1.2987012987012987, + "grad_norm": 0.3784359097480774, + "learning_rate": 0.00022119602267552194, + "loss": 0.6098381805419922, + "mean_token_accuracy": 0.8251238936185836, + "num_tokens": 1278079.0, + "step": 500 + }, + { + "entropy": 0.6593573099374771, + "epoch": 1.4285714285714286, + "grad_norm": 0.36508747935295105, + "learning_rate": 0.00022056419535323196, + "loss": 0.6137563705444335, + "mean_token_accuracy": 0.824503253698349, + "num_tokens": 1405326.0, + "step": 550 + }, + { + "entropy": 0.6459340593218803, + "epoch": 1.5584415584415585, + "grad_norm": 0.39773380756378174, + "learning_rate": 0.00021970702308872148, + "loss": 0.5986767196655274, + "mean_token_accuracy": 0.8270765203237533, + "num_tokens": 1532507.0, + "step": 600 + }, + { + "entropy": 0.633097175359726, + "epoch": 1.6883116883116882, + "grad_norm": 0.41731834411621094, + "learning_rate": 0.00021862626715633265, + "loss": 0.5849922180175782, + "mean_token_accuracy": 0.8313153618574143, + "num_tokens": 1662747.0, + "step": 650 + }, + { + "entropy": 0.6569280475378036, + "epoch": 1.8181818181818183, + "grad_norm": 0.3067665100097656, + "learning_rate": 0.00021732414823885307, + "loss": 0.605062370300293, + "mean_token_accuracy": 0.8269566106796264, + "num_tokens": 1783817.0, + "step": 700 + }, + { + "entropy": 0.6459099870920181, + "epoch": 1.948051948051948, + "grad_norm": 0.5508913397789001, + "learning_rate": 0.00021580334186456886, + "loss": 0.596672134399414, + "mean_token_accuracy": 0.8287447422742844, + "num_tokens": 1914573.0, + "step": 750 + }, + { + "epoch": 2.0, + "eval_entropy": 0.6985598478752834, + "eval_loss": 0.6716415286064148, + "eval_mean_token_accuracy": 0.808078097895934, + "eval_num_tokens": 1963100.0, + "eval_runtime": 46.5197, + "eval_samples_per_second": 35.619, + "eval_steps_per_second": 4.471, + "step": 770 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0934953976464589e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20bdc3e560a9ea3898b4420cd708cf43b0d087bb --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/README.md @@ -0,0 +1,58 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: transformers +model_name: Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1 +tags: +- generated_from_trainer +- trl +- sft +licence: license +--- + +# Model Card for Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1 + +This model is a fine-tuned version of [Qwen/Qwen3-4B-Base](https://huggingface.co/Qwen/Qwen3-4B-Base). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/katriin-kukk/Cross_lingual_morphological_generalization/runs/zi0h7g34) + + + +This model was trained with SFT. + +### Framework versions + +- TRL: 0.29.0 +- Transformers: 5.5.4 +- Pytorch: 2.10.0 +- Datasets: 4.6.1 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..62108c4c8ad6a925598383a7b5a5345aa9b946e6 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.0652985372477836, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj", + "up_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a8f9e7bf36688d4fb23482908e519b39dfbb8d22 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/trainer_state.json @@ -0,0 +1,287 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1122, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.8254910862445832, + "epoch": 0.13386880856760375, + "grad_norm": 0.8958511352539062, + "learning_rate": 2.7446846603309888e-05, + "loss": 1.722928009033203, + "mean_token_accuracy": 0.6557231456041336, + "num_tokens": 122643.0, + "step": 50 + }, + { + "entropy": 0.8272530055046081, + "epoch": 0.2677376171352075, + "grad_norm": 0.6162250638008118, + "learning_rate": 5.5453832933217935e-05, + "loss": 0.7761137390136719, + "mean_token_accuracy": 0.795179500579834, + "num_tokens": 245702.0, + "step": 100 + }, + { + "entropy": 0.6629777508974075, + "epoch": 0.40160642570281124, + "grad_norm": 0.43394413590431213, + "learning_rate": 8.346081926312598e-05, + "loss": 0.6302793884277343, + "mean_token_accuracy": 0.8262274640798569, + "num_tokens": 371834.0, + "step": 150 + }, + { + "entropy": 0.6184763962030411, + "epoch": 0.535475234270415, + "grad_norm": 0.4373406767845154, + "learning_rate": 0.00011146780559303404, + "loss": 0.5863075256347656, + "mean_token_accuracy": 0.8357332807779312, + "num_tokens": 500948.0, + "step": 200 + }, + { + "entropy": 0.6116772794723511, + "epoch": 0.6693440428380187, + "grad_norm": 0.6781982183456421, + "learning_rate": 0.00013947479192294207, + "loss": 0.5783074951171875, + "mean_token_accuracy": 0.8368694090843201, + "num_tokens": 621874.0, + "step": 250 + }, + { + "entropy": 0.5848374783992767, + "epoch": 0.8032128514056225, + "grad_norm": 0.3493351340293884, + "learning_rate": 0.00016748177825285014, + "loss": 0.5531037902832031, + "mean_token_accuracy": 0.8434528934955597, + "num_tokens": 746138.0, + "step": 300 + }, + { + "entropy": 0.57043960750103, + "epoch": 0.9370816599732262, + "grad_norm": 0.3953551650047302, + "learning_rate": 0.00019548876458275817, + "loss": 0.5387083053588867, + "mean_token_accuracy": 0.844996885061264, + "num_tokens": 868331.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.6065685012936592, + "eval_loss": 0.5825985074043274, + "eval_mean_token_accuracy": 0.8326057174801826, + "eval_num_tokens": 920437.0, + "eval_runtime": 49.017, + "eval_samples_per_second": 32.621, + "eval_steps_per_second": 4.08, + "step": 374 + }, + { + "entropy": 0.5623676343397661, + "epoch": 1.069611780455154, + "grad_norm": 0.37431156635284424, + "learning_rate": 0.00020946374495076317, + "loss": 0.526231803894043, + "mean_token_accuracy": 0.8500469212580208, + "num_tokens": 980838.0, + "step": 400 + }, + { + "entropy": 0.5420066699385643, + "epoch": 1.2034805890227578, + "grad_norm": 0.3302382826805115, + "learning_rate": 0.00020923573570386192, + "loss": 0.5088261032104492, + "mean_token_accuracy": 0.8518517130613327, + "num_tokens": 1104763.0, + "step": 450 + }, + { + "entropy": 0.5347033357620239, + "epoch": 1.3373493975903614, + "grad_norm": 0.281392902135849, + "learning_rate": 0.00020878021367110025, + "loss": 0.5044374084472656, + "mean_token_accuracy": 0.8548643559217453, + "num_tokens": 1230459.0, + "step": 500 + }, + { + "entropy": 0.5319472518563271, + "epoch": 1.4712182061579653, + "grad_norm": 0.3298404812812805, + "learning_rate": 0.00020809817069357935, + "loss": 0.5011252593994141, + "mean_token_accuracy": 0.8546603417396545, + "num_tokens": 1356368.0, + "step": 550 + }, + { + "entropy": 0.5234624195098877, + "epoch": 1.605087014725569, + "grad_norm": 0.26496022939682007, + "learning_rate": 0.00020719109183285305, + "loss": 0.49288436889648435, + "mean_token_accuracy": 0.8559961414337158, + "num_tokens": 1484569.0, + "step": 600 + }, + { + "entropy": 0.5170091751217842, + "epoch": 1.7389558232931726, + "grad_norm": 0.2679271996021271, + "learning_rate": 0.00020606095213739626, + "loss": 0.4867116165161133, + "mean_token_accuracy": 0.8584025889635086, + "num_tokens": 1609962.0, + "step": 650 + }, + { + "entropy": 0.519580851495266, + "epoch": 1.8728246318607764, + "grad_norm": 0.2783832848072052, + "learning_rate": 0.0002047102123421885, + "loss": 0.4858899688720703, + "mean_token_accuracy": 0.8601382756233216, + "num_tokens": 1729667.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.5516054129600525, + "eval_loss": 0.5468233227729797, + "eval_mean_token_accuracy": 0.8418151989579201, + "eval_num_tokens": 1840874.0, + "eval_runtime": 49.0401, + "eval_samples_per_second": 32.606, + "eval_steps_per_second": 4.078, + "step": 748 + }, + { + "entropy": 0.5179645954960524, + "epoch": 2.005354752342704, + "grad_norm": 0.39848020672798157, + "learning_rate": 0.00020314181351077757, + "loss": 0.4828613662719727, + "mean_token_accuracy": 0.8594950991447525, + "num_tokens": 1846410.0, + "step": 750 + }, + { + "entropy": 0.4724735128879547, + "epoch": 2.139223560910308, + "grad_norm": 0.27450788021087646, + "learning_rate": 0.00020135917063148916, + "loss": 0.43275066375732424, + "mean_token_accuracy": 0.8704854369163513, + "num_tokens": 1971547.0, + "step": 800 + }, + { + "entropy": 0.4807534040510654, + "epoch": 2.2730923694779115, + "grad_norm": 0.3146534562110901, + "learning_rate": 0.00019936616518172531, + "loss": 0.44326435089111327, + "mean_token_accuracy": 0.8670724099874496, + "num_tokens": 2088560.0, + "step": 850 + }, + { + "entropy": 0.4744683504104614, + "epoch": 2.4069611780455156, + "grad_norm": 0.3151724338531494, + "learning_rate": 0.0001971671366765428, + "loss": 0.44036914825439455, + "mean_token_accuracy": 0.8687405550479889, + "num_tokens": 2210146.0, + "step": 900 + }, + { + "entropy": 0.4728820985555649, + "epoch": 2.540829986613119, + "grad_norm": 0.30318814516067505, + "learning_rate": 0.00019476687321991266, + "loss": 0.43707496643066407, + "mean_token_accuracy": 0.8694235664606095, + "num_tokens": 2331354.0, + "step": 950 + }, + { + "entropy": 0.4706392896175384, + "epoch": 2.674698795180723, + "grad_norm": 0.32992425560951233, + "learning_rate": 0.00019217060107923494, + "loss": 0.4379159545898437, + "mean_token_accuracy": 0.8695945852994919, + "num_tokens": 2452558.0, + "step": 1000 + }, + { + "entropy": 0.46944593608379365, + "epoch": 2.8085676037483265, + "grad_norm": 0.23659372329711914, + "learning_rate": 0.0001893839733058082, + "loss": 0.43395462036132815, + "mean_token_accuracy": 0.8707112389802932, + "num_tokens": 2582428.0, + "step": 1050 + }, + { + "entropy": 0.47586817651987073, + "epoch": 2.9424364123159306, + "grad_norm": 0.25799325108528137, + "learning_rate": 0.00018641305742603172, + "loss": 0.44020862579345704, + "mean_token_accuracy": 0.869452143907547, + "num_tokens": 2707532.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.5092438699305057, + "eval_loss": 0.5377861857414246, + "eval_mean_token_accuracy": 0.8451382353901863, + "eval_num_tokens": 2761311.0, + "eval_runtime": 49.0653, + "eval_samples_per_second": 32.589, + "eval_steps_per_second": 4.076, + "step": 1122 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.451742995129948e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..62108c4c8ad6a925598383a7b5a5345aa9b946e6 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.0652985372477836, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj", + "up_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b39d18fb7d2a26a456968dd6b90f8508457b8d69 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/trainer_state.json @@ -0,0 +1,368 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 1496, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.8254910862445832, + "epoch": 0.13386880856760375, + "grad_norm": 0.8958511352539062, + "learning_rate": 2.7446846603309888e-05, + "loss": 1.722928009033203, + "mean_token_accuracy": 0.6557231456041336, + "num_tokens": 122643.0, + "step": 50 + }, + { + "entropy": 0.8272530055046081, + "epoch": 0.2677376171352075, + "grad_norm": 0.6162250638008118, + "learning_rate": 5.5453832933217935e-05, + "loss": 0.7761137390136719, + "mean_token_accuracy": 0.795179500579834, + "num_tokens": 245702.0, + "step": 100 + }, + { + "entropy": 0.6629777508974075, + "epoch": 0.40160642570281124, + "grad_norm": 0.43394413590431213, + "learning_rate": 8.346081926312598e-05, + "loss": 0.6302793884277343, + "mean_token_accuracy": 0.8262274640798569, + "num_tokens": 371834.0, + "step": 150 + }, + { + "entropy": 0.6184763962030411, + "epoch": 0.535475234270415, + "grad_norm": 0.4373406767845154, + "learning_rate": 0.00011146780559303404, + "loss": 0.5863075256347656, + "mean_token_accuracy": 0.8357332807779312, + "num_tokens": 500948.0, + "step": 200 + }, + { + "entropy": 0.6116772794723511, + "epoch": 0.6693440428380187, + "grad_norm": 0.6781982183456421, + "learning_rate": 0.00013947479192294207, + "loss": 0.5783074951171875, + "mean_token_accuracy": 0.8368694090843201, + "num_tokens": 621874.0, + "step": 250 + }, + { + "entropy": 0.5848374783992767, + "epoch": 0.8032128514056225, + "grad_norm": 0.3493351340293884, + "learning_rate": 0.00016748177825285014, + "loss": 0.5531037902832031, + "mean_token_accuracy": 0.8434528934955597, + "num_tokens": 746138.0, + "step": 300 + }, + { + "entropy": 0.57043960750103, + "epoch": 0.9370816599732262, + "grad_norm": 0.3953551650047302, + "learning_rate": 0.00019548876458275817, + "loss": 0.5387083053588867, + "mean_token_accuracy": 0.844996885061264, + "num_tokens": 868331.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.6065685012936592, + "eval_loss": 0.5825985074043274, + "eval_mean_token_accuracy": 0.8326057174801826, + "eval_num_tokens": 920437.0, + "eval_runtime": 49.017, + "eval_samples_per_second": 32.621, + "eval_steps_per_second": 4.08, + "step": 374 + }, + { + "entropy": 0.5623676343397661, + "epoch": 1.069611780455154, + "grad_norm": 0.37431156635284424, + "learning_rate": 0.00020946374495076317, + "loss": 0.526231803894043, + "mean_token_accuracy": 0.8500469212580208, + "num_tokens": 980838.0, + "step": 400 + }, + { + "entropy": 0.5420066699385643, + "epoch": 1.2034805890227578, + "grad_norm": 0.3302382826805115, + "learning_rate": 0.00020923573570386192, + "loss": 0.5088261032104492, + "mean_token_accuracy": 0.8518517130613327, + "num_tokens": 1104763.0, + "step": 450 + }, + { + "entropy": 0.5347033357620239, + "epoch": 1.3373493975903614, + "grad_norm": 0.281392902135849, + "learning_rate": 0.00020878021367110025, + "loss": 0.5044374084472656, + "mean_token_accuracy": 0.8548643559217453, + "num_tokens": 1230459.0, + "step": 500 + }, + { + "entropy": 0.5319472518563271, + "epoch": 1.4712182061579653, + "grad_norm": 0.3298404812812805, + "learning_rate": 0.00020809817069357935, + "loss": 0.5011252593994141, + "mean_token_accuracy": 0.8546603417396545, + "num_tokens": 1356368.0, + "step": 550 + }, + { + "entropy": 0.5234624195098877, + "epoch": 1.605087014725569, + "grad_norm": 0.26496022939682007, + "learning_rate": 0.00020719109183285305, + "loss": 0.49288436889648435, + "mean_token_accuracy": 0.8559961414337158, + "num_tokens": 1484569.0, + "step": 600 + }, + { + "entropy": 0.5170091751217842, + "epoch": 1.7389558232931726, + "grad_norm": 0.2679271996021271, + "learning_rate": 0.00020606095213739626, + "loss": 0.4867116165161133, + "mean_token_accuracy": 0.8584025889635086, + "num_tokens": 1609962.0, + "step": 650 + }, + { + "entropy": 0.519580851495266, + "epoch": 1.8728246318607764, + "grad_norm": 0.2783832848072052, + "learning_rate": 0.0002047102123421885, + "loss": 0.4858899688720703, + "mean_token_accuracy": 0.8601382756233216, + "num_tokens": 1729667.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.5516054129600525, + "eval_loss": 0.5468233227729797, + "eval_mean_token_accuracy": 0.8418151989579201, + "eval_num_tokens": 1840874.0, + "eval_runtime": 49.0401, + "eval_samples_per_second": 32.606, + "eval_steps_per_second": 4.078, + "step": 748 + }, + { + "entropy": 0.5179645954960524, + "epoch": 2.005354752342704, + "grad_norm": 0.39848020672798157, + "learning_rate": 0.00020314181351077757, + "loss": 0.4828613662719727, + "mean_token_accuracy": 0.8594950991447525, + "num_tokens": 1846410.0, + "step": 750 + }, + { + "entropy": 0.4724735128879547, + "epoch": 2.139223560910308, + "grad_norm": 0.27450788021087646, + "learning_rate": 0.00020135917063148916, + "loss": 0.43275066375732424, + "mean_token_accuracy": 0.8704854369163513, + "num_tokens": 1971547.0, + "step": 800 + }, + { + "entropy": 0.4807534040510654, + "epoch": 2.2730923694779115, + "grad_norm": 0.3146534562110901, + "learning_rate": 0.00019936616518172531, + "loss": 0.44326435089111327, + "mean_token_accuracy": 0.8670724099874496, + "num_tokens": 2088560.0, + "step": 850 + }, + { + "entropy": 0.4744683504104614, + "epoch": 2.4069611780455156, + "grad_norm": 0.3151724338531494, + "learning_rate": 0.0001971671366765428, + "loss": 0.44036914825439455, + "mean_token_accuracy": 0.8687405550479889, + "num_tokens": 2210146.0, + "step": 900 + }, + { + "entropy": 0.4728820985555649, + "epoch": 2.540829986613119, + "grad_norm": 0.30318814516067505, + "learning_rate": 0.00019476687321991266, + "loss": 0.43707496643066407, + "mean_token_accuracy": 0.8694235664606095, + "num_tokens": 2331354.0, + "step": 950 + }, + { + "entropy": 0.4706392896175384, + "epoch": 2.674698795180723, + "grad_norm": 0.32992425560951233, + "learning_rate": 0.00019217060107923494, + "loss": 0.4379159545898437, + "mean_token_accuracy": 0.8695945852994919, + "num_tokens": 2452558.0, + "step": 1000 + }, + { + "entropy": 0.46944593608379365, + "epoch": 2.8085676037483265, + "grad_norm": 0.23659372329711914, + "learning_rate": 0.0001893839733058082, + "loss": 0.43395462036132815, + "mean_token_accuracy": 0.8707112389802932, + "num_tokens": 2582428.0, + "step": 1050 + }, + { + "entropy": 0.47586817651987073, + "epoch": 2.9424364123159306, + "grad_norm": 0.25799325108528137, + "learning_rate": 0.00018641305742603172, + "loss": 0.44020862579345704, + "mean_token_accuracy": 0.869452143907547, + "num_tokens": 2707532.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.5092438699305057, + "eval_loss": 0.5377861857414246, + "eval_mean_token_accuracy": 0.8451382353901863, + "eval_num_tokens": 2761311.0, + "eval_runtime": 49.0653, + "eval_samples_per_second": 32.589, + "eval_steps_per_second": 4.076, + "step": 1122 + }, + { + "entropy": 0.44004572521556506, + "epoch": 3.074966532797858, + "grad_norm": 0.31287693977355957, + "learning_rate": 0.0001832643222301409, + "loss": 0.39778636932373046, + "mean_token_accuracy": 0.8794932232962714, + "num_tokens": 2834344.0, + "step": 1150 + }, + { + "entropy": 0.42112498462200165, + "epoch": 3.208835341365462, + "grad_norm": 0.3411082327365875, + "learning_rate": 0.000179944623687242, + "loss": 0.3773982620239258, + "mean_token_accuracy": 0.8827895969152451, + "num_tokens": 2952748.0, + "step": 1200 + }, + { + "entropy": 0.41285495966672897, + "epoch": 3.3427041499330654, + "grad_norm": 0.31819090247154236, + "learning_rate": 0.0001764611900173143, + "loss": 0.3741728210449219, + "mean_token_accuracy": 0.8846501314640045, + "num_tokens": 3080333.0, + "step": 1250 + }, + { + "entropy": 0.42110098838806154, + "epoch": 3.4765729585006695, + "grad_norm": 0.3009640872478485, + "learning_rate": 0.00017282160595268327, + "loss": 0.3816569900512695, + "mean_token_accuracy": 0.8814844501018524, + "num_tokens": 3206920.0, + "step": 1300 + }, + { + "entropy": 0.4269171151518822, + "epoch": 3.610441767068273, + "grad_norm": 0.3651696741580963, + "learning_rate": 0.00016903379622323396, + "loss": 0.38763641357421874, + "mean_token_accuracy": 0.8813142603635788, + "num_tokens": 3327931.0, + "step": 1350 + }, + { + "entropy": 0.4262796178460121, + "epoch": 3.7443105756358768, + "grad_norm": 0.28190597891807556, + "learning_rate": 0.00016510600830132272, + "loss": 0.38563640594482423, + "mean_token_accuracy": 0.8813980734348297, + "num_tokens": 3443874.0, + "step": 1400 + }, + { + "entropy": 0.4200029063224793, + "epoch": 3.878179384203481, + "grad_norm": 0.2584955096244812, + "learning_rate": 0.00016104679444395854, + "loss": 0.3829658508300781, + "mean_token_accuracy": 0.8822885012626648, + "num_tokens": 3573158.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.47917599841952324, + "eval_loss": 0.5421923398971558, + "eval_mean_token_accuracy": 0.8468858799338341, + "eval_num_tokens": 3681748.0, + "eval_runtime": 49.0466, + "eval_samples_per_second": 32.602, + "eval_steps_per_second": 4.078, + "step": 1496 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.9374623481280205e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..62108c4c8ad6a925598383a7b5a5345aa9b946e6 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.0652985372477836, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj", + "up_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a209f85416c7076dcb08e85e0ec91fe1093a3ded --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/trainer_state.json @@ -0,0 +1,459 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 1870, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.8254910862445832, + "epoch": 0.13386880856760375, + "grad_norm": 0.8958511352539062, + "learning_rate": 2.7446846603309888e-05, + "loss": 1.722928009033203, + "mean_token_accuracy": 0.6557231456041336, + "num_tokens": 122643.0, + "step": 50 + }, + { + "entropy": 0.8272530055046081, + "epoch": 0.2677376171352075, + "grad_norm": 0.6162250638008118, + "learning_rate": 5.5453832933217935e-05, + "loss": 0.7761137390136719, + "mean_token_accuracy": 0.795179500579834, + "num_tokens": 245702.0, + "step": 100 + }, + { + "entropy": 0.6629777508974075, + "epoch": 0.40160642570281124, + "grad_norm": 0.43394413590431213, + "learning_rate": 8.346081926312598e-05, + "loss": 0.6302793884277343, + "mean_token_accuracy": 0.8262274640798569, + "num_tokens": 371834.0, + "step": 150 + }, + { + "entropy": 0.6184763962030411, + "epoch": 0.535475234270415, + "grad_norm": 0.4373406767845154, + "learning_rate": 0.00011146780559303404, + "loss": 0.5863075256347656, + "mean_token_accuracy": 0.8357332807779312, + "num_tokens": 500948.0, + "step": 200 + }, + { + "entropy": 0.6116772794723511, + "epoch": 0.6693440428380187, + "grad_norm": 0.6781982183456421, + "learning_rate": 0.00013947479192294207, + "loss": 0.5783074951171875, + "mean_token_accuracy": 0.8368694090843201, + "num_tokens": 621874.0, + "step": 250 + }, + { + "entropy": 0.5848374783992767, + "epoch": 0.8032128514056225, + "grad_norm": 0.3493351340293884, + "learning_rate": 0.00016748177825285014, + "loss": 0.5531037902832031, + "mean_token_accuracy": 0.8434528934955597, + "num_tokens": 746138.0, + "step": 300 + }, + { + "entropy": 0.57043960750103, + "epoch": 0.9370816599732262, + "grad_norm": 0.3953551650047302, + "learning_rate": 0.00019548876458275817, + "loss": 0.5387083053588867, + "mean_token_accuracy": 0.844996885061264, + "num_tokens": 868331.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.6065685012936592, + "eval_loss": 0.5825985074043274, + "eval_mean_token_accuracy": 0.8326057174801826, + "eval_num_tokens": 920437.0, + "eval_runtime": 49.017, + "eval_samples_per_second": 32.621, + "eval_steps_per_second": 4.08, + "step": 374 + }, + { + "entropy": 0.5623676343397661, + "epoch": 1.069611780455154, + "grad_norm": 0.37431156635284424, + "learning_rate": 0.00020946374495076317, + "loss": 0.526231803894043, + "mean_token_accuracy": 0.8500469212580208, + "num_tokens": 980838.0, + "step": 400 + }, + { + "entropy": 0.5420066699385643, + "epoch": 1.2034805890227578, + "grad_norm": 0.3302382826805115, + "learning_rate": 0.00020923573570386192, + "loss": 0.5088261032104492, + "mean_token_accuracy": 0.8518517130613327, + "num_tokens": 1104763.0, + "step": 450 + }, + { + "entropy": 0.5347033357620239, + "epoch": 1.3373493975903614, + "grad_norm": 0.281392902135849, + "learning_rate": 0.00020878021367110025, + "loss": 0.5044374084472656, + "mean_token_accuracy": 0.8548643559217453, + "num_tokens": 1230459.0, + "step": 500 + }, + { + "entropy": 0.5319472518563271, + "epoch": 1.4712182061579653, + "grad_norm": 0.3298404812812805, + "learning_rate": 0.00020809817069357935, + "loss": 0.5011252593994141, + "mean_token_accuracy": 0.8546603417396545, + "num_tokens": 1356368.0, + "step": 550 + }, + { + "entropy": 0.5234624195098877, + "epoch": 1.605087014725569, + "grad_norm": 0.26496022939682007, + "learning_rate": 0.00020719109183285305, + "loss": 0.49288436889648435, + "mean_token_accuracy": 0.8559961414337158, + "num_tokens": 1484569.0, + "step": 600 + }, + { + "entropy": 0.5170091751217842, + "epoch": 1.7389558232931726, + "grad_norm": 0.2679271996021271, + "learning_rate": 0.00020606095213739626, + "loss": 0.4867116165161133, + "mean_token_accuracy": 0.8584025889635086, + "num_tokens": 1609962.0, + "step": 650 + }, + { + "entropy": 0.519580851495266, + "epoch": 1.8728246318607764, + "grad_norm": 0.2783832848072052, + "learning_rate": 0.0002047102123421885, + "loss": 0.4858899688720703, + "mean_token_accuracy": 0.8601382756233216, + "num_tokens": 1729667.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.5516054129600525, + "eval_loss": 0.5468233227729797, + "eval_mean_token_accuracy": 0.8418151989579201, + "eval_num_tokens": 1840874.0, + "eval_runtime": 49.0401, + "eval_samples_per_second": 32.606, + "eval_steps_per_second": 4.078, + "step": 748 + }, + { + "entropy": 0.5179645954960524, + "epoch": 2.005354752342704, + "grad_norm": 0.39848020672798157, + "learning_rate": 0.00020314181351077757, + "loss": 0.4828613662719727, + "mean_token_accuracy": 0.8594950991447525, + "num_tokens": 1846410.0, + "step": 750 + }, + { + "entropy": 0.4724735128879547, + "epoch": 2.139223560910308, + "grad_norm": 0.27450788021087646, + "learning_rate": 0.00020135917063148916, + "loss": 0.43275066375732424, + "mean_token_accuracy": 0.8704854369163513, + "num_tokens": 1971547.0, + "step": 800 + }, + { + "entropy": 0.4807534040510654, + "epoch": 2.2730923694779115, + "grad_norm": 0.3146534562110901, + "learning_rate": 0.00019936616518172531, + "loss": 0.44326435089111327, + "mean_token_accuracy": 0.8670724099874496, + "num_tokens": 2088560.0, + "step": 850 + }, + { + "entropy": 0.4744683504104614, + "epoch": 2.4069611780455156, + "grad_norm": 0.3151724338531494, + "learning_rate": 0.0001971671366765428, + "loss": 0.44036914825439455, + "mean_token_accuracy": 0.8687405550479889, + "num_tokens": 2210146.0, + "step": 900 + }, + { + "entropy": 0.4728820985555649, + "epoch": 2.540829986613119, + "grad_norm": 0.30318814516067505, + "learning_rate": 0.00019476687321991266, + "loss": 0.43707496643066407, + "mean_token_accuracy": 0.8694235664606095, + "num_tokens": 2331354.0, + "step": 950 + }, + { + "entropy": 0.4706392896175384, + "epoch": 2.674698795180723, + "grad_norm": 0.32992425560951233, + "learning_rate": 0.00019217060107923494, + "loss": 0.4379159545898437, + "mean_token_accuracy": 0.8695945852994919, + "num_tokens": 2452558.0, + "step": 1000 + }, + { + "entropy": 0.46944593608379365, + "epoch": 2.8085676037483265, + "grad_norm": 0.23659372329711914, + "learning_rate": 0.0001893839733058082, + "loss": 0.43395462036132815, + "mean_token_accuracy": 0.8707112389802932, + "num_tokens": 2582428.0, + "step": 1050 + }, + { + "entropy": 0.47586817651987073, + "epoch": 2.9424364123159306, + "grad_norm": 0.25799325108528137, + "learning_rate": 0.00018641305742603172, + "loss": 0.44020862579345704, + "mean_token_accuracy": 0.869452143907547, + "num_tokens": 2707532.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.5092438699305057, + "eval_loss": 0.5377861857414246, + "eval_mean_token_accuracy": 0.8451382353901863, + "eval_num_tokens": 2761311.0, + "eval_runtime": 49.0653, + "eval_samples_per_second": 32.589, + "eval_steps_per_second": 4.076, + "step": 1122 + }, + { + "entropy": 0.44004572521556506, + "epoch": 3.074966532797858, + "grad_norm": 0.31287693977355957, + "learning_rate": 0.0001832643222301409, + "loss": 0.39778636932373046, + "mean_token_accuracy": 0.8794932232962714, + "num_tokens": 2834344.0, + "step": 1150 + }, + { + "entropy": 0.42112498462200165, + "epoch": 3.208835341365462, + "grad_norm": 0.3411082327365875, + "learning_rate": 0.000179944623687242, + "loss": 0.3773982620239258, + "mean_token_accuracy": 0.8827895969152451, + "num_tokens": 2952748.0, + "step": 1200 + }, + { + "entropy": 0.41285495966672897, + "epoch": 3.3427041499330654, + "grad_norm": 0.31819090247154236, + "learning_rate": 0.0001764611900173143, + "loss": 0.3741728210449219, + "mean_token_accuracy": 0.8846501314640045, + "num_tokens": 3080333.0, + "step": 1250 + }, + { + "entropy": 0.42110098838806154, + "epoch": 3.4765729585006695, + "grad_norm": 0.3009640872478485, + "learning_rate": 0.00017282160595268327, + "loss": 0.3816569900512695, + "mean_token_accuracy": 0.8814844501018524, + "num_tokens": 3206920.0, + "step": 1300 + }, + { + "entropy": 0.4269171151518822, + "epoch": 3.610441767068273, + "grad_norm": 0.3651696741580963, + "learning_rate": 0.00016903379622323396, + "loss": 0.38763641357421874, + "mean_token_accuracy": 0.8813142603635788, + "num_tokens": 3327931.0, + "step": 1350 + }, + { + "entropy": 0.4262796178460121, + "epoch": 3.7443105756358768, + "grad_norm": 0.28190597891807556, + "learning_rate": 0.00016510600830132272, + "loss": 0.38563640594482423, + "mean_token_accuracy": 0.8813980734348297, + "num_tokens": 3443874.0, + "step": 1400 + }, + { + "entropy": 0.4200029063224793, + "epoch": 3.878179384203481, + "grad_norm": 0.2584955096244812, + "learning_rate": 0.00016104679444395854, + "loss": 0.3829658508300781, + "mean_token_accuracy": 0.8822885012626648, + "num_tokens": 3573158.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.47917599841952324, + "eval_loss": 0.5421923398971558, + "eval_mean_token_accuracy": 0.8468858799338341, + "eval_num_tokens": 3681748.0, + "eval_runtime": 49.0466, + "eval_samples_per_second": 32.602, + "eval_steps_per_second": 4.078, + "step": 1496 + }, + { + "entropy": 0.42061784803265273, + "epoch": 4.010709504685408, + "grad_norm": 0.3225401043891907, + "learning_rate": 0.0001568649930713548, + "loss": 0.38076282501220704, + "mean_token_accuracy": 0.8825460594109814, + "num_tokens": 3691654.0, + "step": 1500 + }, + { + "entropy": 0.3564541311562061, + "epoch": 4.144578313253012, + "grad_norm": 0.42701128125190735, + "learning_rate": 0.00015256970952239702, + "loss": 0.3080678176879883, + "mean_token_accuracy": 0.9021131205558777, + "num_tokens": 3813863.0, + "step": 1550 + }, + { + "entropy": 0.3623583456873894, + "epoch": 4.278447121820616, + "grad_norm": 0.7542179822921753, + "learning_rate": 0.00014817029622892904, + "loss": 0.31919103622436523, + "mean_token_accuracy": 0.8978805804252624, + "num_tokens": 3942009.0, + "step": 1600 + }, + { + "entropy": 0.35793897867202756, + "epoch": 4.412315930388219, + "grad_norm": 0.3683936595916748, + "learning_rate": 0.0001436763323520266, + "loss": 0.31606245040893555, + "mean_token_accuracy": 0.8989632934331894, + "num_tokens": 4067146.0, + "step": 1650 + }, + { + "entropy": 0.36605307310819624, + "epoch": 4.546184738955823, + "grad_norm": 0.3938419222831726, + "learning_rate": 0.00013909760292459586, + "loss": 0.3214926528930664, + "mean_token_accuracy": 0.897950147986412, + "num_tokens": 4186586.0, + "step": 1700 + }, + { + "entropy": 0.3702411252260208, + "epoch": 4.680053547523427, + "grad_norm": 0.3159140646457672, + "learning_rate": 0.0001344440775457131, + "loss": 0.32606857299804687, + "mean_token_accuracy": 0.8971680045127869, + "num_tokens": 4307001.0, + "step": 1750 + }, + { + "entropy": 0.3695961621403694, + "epoch": 4.813922356091031, + "grad_norm": 0.43663182854652405, + "learning_rate": 0.00012972588867309488, + "loss": 0.324642448425293, + "mean_token_accuracy": 0.8974496972560883, + "num_tokens": 4429513.0, + "step": 1800 + }, + { + "entropy": 0.3658722630143166, + "epoch": 4.947791164658635, + "grad_norm": 0.3189752995967865, + "learning_rate": 0.0001249533095609642, + "loss": 0.3198036575317383, + "mean_token_accuracy": 0.8985732847452164, + "num_tokens": 4557562.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.43428498685359956, + "eval_loss": 0.5698739290237427, + "eval_mean_token_accuracy": 0.8450208070874214, + "eval_num_tokens": 4602185.0, + "eval_runtime": 49.0399, + "eval_samples_per_second": 32.606, + "eval_steps_per_second": 4.078, + "step": 1870 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.415768868260864e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..62108c4c8ad6a925598383a7b5a5345aa9b946e6 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.0652985372477836, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj", + "up_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9bb85de81c8a519dff0f58100785ea288245bce9 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/trainer_state.json @@ -0,0 +1,540 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.0, + "eval_steps": 500, + "global_step": 2244, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.8254910862445832, + "epoch": 0.13386880856760375, + "grad_norm": 0.8958511352539062, + "learning_rate": 2.7446846603309888e-05, + "loss": 1.722928009033203, + "mean_token_accuracy": 0.6557231456041336, + "num_tokens": 122643.0, + "step": 50 + }, + { + "entropy": 0.8272530055046081, + "epoch": 0.2677376171352075, + "grad_norm": 0.6162250638008118, + "learning_rate": 5.5453832933217935e-05, + "loss": 0.7761137390136719, + "mean_token_accuracy": 0.795179500579834, + "num_tokens": 245702.0, + "step": 100 + }, + { + "entropy": 0.6629777508974075, + "epoch": 0.40160642570281124, + "grad_norm": 0.43394413590431213, + "learning_rate": 8.346081926312598e-05, + "loss": 0.6302793884277343, + "mean_token_accuracy": 0.8262274640798569, + "num_tokens": 371834.0, + "step": 150 + }, + { + "entropy": 0.6184763962030411, + "epoch": 0.535475234270415, + "grad_norm": 0.4373406767845154, + "learning_rate": 0.00011146780559303404, + "loss": 0.5863075256347656, + "mean_token_accuracy": 0.8357332807779312, + "num_tokens": 500948.0, + "step": 200 + }, + { + "entropy": 0.6116772794723511, + "epoch": 0.6693440428380187, + "grad_norm": 0.6781982183456421, + "learning_rate": 0.00013947479192294207, + "loss": 0.5783074951171875, + "mean_token_accuracy": 0.8368694090843201, + "num_tokens": 621874.0, + "step": 250 + }, + { + "entropy": 0.5848374783992767, + "epoch": 0.8032128514056225, + "grad_norm": 0.3493351340293884, + "learning_rate": 0.00016748177825285014, + "loss": 0.5531037902832031, + "mean_token_accuracy": 0.8434528934955597, + "num_tokens": 746138.0, + "step": 300 + }, + { + "entropy": 0.57043960750103, + "epoch": 0.9370816599732262, + "grad_norm": 0.3953551650047302, + "learning_rate": 0.00019548876458275817, + "loss": 0.5387083053588867, + "mean_token_accuracy": 0.844996885061264, + "num_tokens": 868331.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.6065685012936592, + "eval_loss": 0.5825985074043274, + "eval_mean_token_accuracy": 0.8326057174801826, + "eval_num_tokens": 920437.0, + "eval_runtime": 49.017, + "eval_samples_per_second": 32.621, + "eval_steps_per_second": 4.08, + "step": 374 + }, + { + "entropy": 0.5623676343397661, + "epoch": 1.069611780455154, + "grad_norm": 0.37431156635284424, + "learning_rate": 0.00020946374495076317, + "loss": 0.526231803894043, + "mean_token_accuracy": 0.8500469212580208, + "num_tokens": 980838.0, + "step": 400 + }, + { + "entropy": 0.5420066699385643, + "epoch": 1.2034805890227578, + "grad_norm": 0.3302382826805115, + "learning_rate": 0.00020923573570386192, + "loss": 0.5088261032104492, + "mean_token_accuracy": 0.8518517130613327, + "num_tokens": 1104763.0, + "step": 450 + }, + { + "entropy": 0.5347033357620239, + "epoch": 1.3373493975903614, + "grad_norm": 0.281392902135849, + "learning_rate": 0.00020878021367110025, + "loss": 0.5044374084472656, + "mean_token_accuracy": 0.8548643559217453, + "num_tokens": 1230459.0, + "step": 500 + }, + { + "entropy": 0.5319472518563271, + "epoch": 1.4712182061579653, + "grad_norm": 0.3298404812812805, + "learning_rate": 0.00020809817069357935, + "loss": 0.5011252593994141, + "mean_token_accuracy": 0.8546603417396545, + "num_tokens": 1356368.0, + "step": 550 + }, + { + "entropy": 0.5234624195098877, + "epoch": 1.605087014725569, + "grad_norm": 0.26496022939682007, + "learning_rate": 0.00020719109183285305, + "loss": 0.49288436889648435, + "mean_token_accuracy": 0.8559961414337158, + "num_tokens": 1484569.0, + "step": 600 + }, + { + "entropy": 0.5170091751217842, + "epoch": 1.7389558232931726, + "grad_norm": 0.2679271996021271, + "learning_rate": 0.00020606095213739626, + "loss": 0.4867116165161133, + "mean_token_accuracy": 0.8584025889635086, + "num_tokens": 1609962.0, + "step": 650 + }, + { + "entropy": 0.519580851495266, + "epoch": 1.8728246318607764, + "grad_norm": 0.2783832848072052, + "learning_rate": 0.0002047102123421885, + "loss": 0.4858899688720703, + "mean_token_accuracy": 0.8601382756233216, + "num_tokens": 1729667.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.5516054129600525, + "eval_loss": 0.5468233227729797, + "eval_mean_token_accuracy": 0.8418151989579201, + "eval_num_tokens": 1840874.0, + "eval_runtime": 49.0401, + "eval_samples_per_second": 32.606, + "eval_steps_per_second": 4.078, + "step": 748 + }, + { + "entropy": 0.5179645954960524, + "epoch": 2.005354752342704, + "grad_norm": 0.39848020672798157, + "learning_rate": 0.00020314181351077757, + "loss": 0.4828613662719727, + "mean_token_accuracy": 0.8594950991447525, + "num_tokens": 1846410.0, + "step": 750 + }, + { + "entropy": 0.4724735128879547, + "epoch": 2.139223560910308, + "grad_norm": 0.27450788021087646, + "learning_rate": 0.00020135917063148916, + "loss": 0.43275066375732424, + "mean_token_accuracy": 0.8704854369163513, + "num_tokens": 1971547.0, + "step": 800 + }, + { + "entropy": 0.4807534040510654, + "epoch": 2.2730923694779115, + "grad_norm": 0.3146534562110901, + "learning_rate": 0.00019936616518172531, + "loss": 0.44326435089111327, + "mean_token_accuracy": 0.8670724099874496, + "num_tokens": 2088560.0, + "step": 850 + }, + { + "entropy": 0.4744683504104614, + "epoch": 2.4069611780455156, + "grad_norm": 0.3151724338531494, + "learning_rate": 0.0001971671366765428, + "loss": 0.44036914825439455, + "mean_token_accuracy": 0.8687405550479889, + "num_tokens": 2210146.0, + "step": 900 + }, + { + "entropy": 0.4728820985555649, + "epoch": 2.540829986613119, + "grad_norm": 0.30318814516067505, + "learning_rate": 0.00019476687321991266, + "loss": 0.43707496643066407, + "mean_token_accuracy": 0.8694235664606095, + "num_tokens": 2331354.0, + "step": 950 + }, + { + "entropy": 0.4706392896175384, + "epoch": 2.674698795180723, + "grad_norm": 0.32992425560951233, + "learning_rate": 0.00019217060107923494, + "loss": 0.4379159545898437, + "mean_token_accuracy": 0.8695945852994919, + "num_tokens": 2452558.0, + "step": 1000 + }, + { + "entropy": 0.46944593608379365, + "epoch": 2.8085676037483265, + "grad_norm": 0.23659372329711914, + "learning_rate": 0.0001893839733058082, + "loss": 0.43395462036132815, + "mean_token_accuracy": 0.8707112389802932, + "num_tokens": 2582428.0, + "step": 1050 + }, + { + "entropy": 0.47586817651987073, + "epoch": 2.9424364123159306, + "grad_norm": 0.25799325108528137, + "learning_rate": 0.00018641305742603172, + "loss": 0.44020862579345704, + "mean_token_accuracy": 0.869452143907547, + "num_tokens": 2707532.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.5092438699305057, + "eval_loss": 0.5377861857414246, + "eval_mean_token_accuracy": 0.8451382353901863, + "eval_num_tokens": 2761311.0, + "eval_runtime": 49.0653, + "eval_samples_per_second": 32.589, + "eval_steps_per_second": 4.076, + "step": 1122 + }, + { + "entropy": 0.44004572521556506, + "epoch": 3.074966532797858, + "grad_norm": 0.31287693977355957, + "learning_rate": 0.0001832643222301409, + "loss": 0.39778636932373046, + "mean_token_accuracy": 0.8794932232962714, + "num_tokens": 2834344.0, + "step": 1150 + }, + { + "entropy": 0.42112498462200165, + "epoch": 3.208835341365462, + "grad_norm": 0.3411082327365875, + "learning_rate": 0.000179944623687242, + "loss": 0.3773982620239258, + "mean_token_accuracy": 0.8827895969152451, + "num_tokens": 2952748.0, + "step": 1200 + }, + { + "entropy": 0.41285495966672897, + "epoch": 3.3427041499330654, + "grad_norm": 0.31819090247154236, + "learning_rate": 0.0001764611900173143, + "loss": 0.3741728210449219, + "mean_token_accuracy": 0.8846501314640045, + "num_tokens": 3080333.0, + "step": 1250 + }, + { + "entropy": 0.42110098838806154, + "epoch": 3.4765729585006695, + "grad_norm": 0.3009640872478485, + "learning_rate": 0.00017282160595268327, + "loss": 0.3816569900512695, + "mean_token_accuracy": 0.8814844501018524, + "num_tokens": 3206920.0, + "step": 1300 + }, + { + "entropy": 0.4269171151518822, + "epoch": 3.610441767068273, + "grad_norm": 0.3651696741580963, + "learning_rate": 0.00016903379622323396, + "loss": 0.38763641357421874, + "mean_token_accuracy": 0.8813142603635788, + "num_tokens": 3327931.0, + "step": 1350 + }, + { + "entropy": 0.4262796178460121, + "epoch": 3.7443105756358768, + "grad_norm": 0.28190597891807556, + "learning_rate": 0.00016510600830132272, + "loss": 0.38563640594482423, + "mean_token_accuracy": 0.8813980734348297, + "num_tokens": 3443874.0, + "step": 1400 + }, + { + "entropy": 0.4200029063224793, + "epoch": 3.878179384203481, + "grad_norm": 0.2584955096244812, + "learning_rate": 0.00016104679444395854, + "loss": 0.3829658508300781, + "mean_token_accuracy": 0.8822885012626648, + "num_tokens": 3573158.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.47917599841952324, + "eval_loss": 0.5421923398971558, + "eval_mean_token_accuracy": 0.8468858799338341, + "eval_num_tokens": 3681748.0, + "eval_runtime": 49.0466, + "eval_samples_per_second": 32.602, + "eval_steps_per_second": 4.078, + "step": 1496 + }, + { + "entropy": 0.42061784803265273, + "epoch": 4.010709504685408, + "grad_norm": 0.3225401043891907, + "learning_rate": 0.0001568649930713548, + "loss": 0.38076282501220704, + "mean_token_accuracy": 0.8825460594109814, + "num_tokens": 3691654.0, + "step": 1500 + }, + { + "entropy": 0.3564541311562061, + "epoch": 4.144578313253012, + "grad_norm": 0.42701128125190735, + "learning_rate": 0.00015256970952239702, + "loss": 0.3080678176879883, + "mean_token_accuracy": 0.9021131205558777, + "num_tokens": 3813863.0, + "step": 1550 + }, + { + "entropy": 0.3623583456873894, + "epoch": 4.278447121820616, + "grad_norm": 0.7542179822921753, + "learning_rate": 0.00014817029622892904, + "loss": 0.31919103622436523, + "mean_token_accuracy": 0.8978805804252624, + "num_tokens": 3942009.0, + "step": 1600 + }, + { + "entropy": 0.35793897867202756, + "epoch": 4.412315930388219, + "grad_norm": 0.3683936595916748, + "learning_rate": 0.0001436763323520266, + "loss": 0.31606245040893555, + "mean_token_accuracy": 0.8989632934331894, + "num_tokens": 4067146.0, + "step": 1650 + }, + { + "entropy": 0.36605307310819624, + "epoch": 4.546184738955823, + "grad_norm": 0.3938419222831726, + "learning_rate": 0.00013909760292459586, + "loss": 0.3214926528930664, + "mean_token_accuracy": 0.897950147986412, + "num_tokens": 4186586.0, + "step": 1700 + }, + { + "entropy": 0.3702411252260208, + "epoch": 4.680053547523427, + "grad_norm": 0.3159140646457672, + "learning_rate": 0.0001344440775457131, + "loss": 0.32606857299804687, + "mean_token_accuracy": 0.8971680045127869, + "num_tokens": 4307001.0, + "step": 1750 + }, + { + "entropy": 0.3695961621403694, + "epoch": 4.813922356091031, + "grad_norm": 0.43663182854652405, + "learning_rate": 0.00012972588867309488, + "loss": 0.324642448425293, + "mean_token_accuracy": 0.8974496972560883, + "num_tokens": 4429513.0, + "step": 1800 + }, + { + "entropy": 0.3658722630143166, + "epoch": 4.947791164658635, + "grad_norm": 0.3189752995967865, + "learning_rate": 0.0001249533095609642, + "loss": 0.3198036575317383, + "mean_token_accuracy": 0.8985732847452164, + "num_tokens": 4557562.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.43428498685359956, + "eval_loss": 0.5698739290237427, + "eval_mean_token_accuracy": 0.8450208070874214, + "eval_num_tokens": 4602185.0, + "eval_runtime": 49.0399, + "eval_samples_per_second": 32.606, + "eval_steps_per_second": 4.078, + "step": 1870 + }, + { + "entropy": 0.3230800422454121, + "epoch": 5.080321285140562, + "grad_norm": 0.38153186440467834, + "learning_rate": 0.00012013673189135029, + "loss": 0.2727243995666504, + "mean_token_accuracy": 0.9120446022110756, + "num_tokens": 4679483.0, + "step": 1900 + }, + { + "entropy": 0.2979305517673492, + "epoch": 5.214190093708166, + "grad_norm": 0.42468664050102234, + "learning_rate": 0.00011528664314752708, + "loss": 0.24437490463256836, + "mean_token_accuracy": 0.9198145979642868, + "num_tokens": 4800131.0, + "step": 1950 + }, + { + "entropy": 0.29811844661831854, + "epoch": 5.34805890227577, + "grad_norm": 0.48722052574157715, + "learning_rate": 0.0001104136037788565, + "loss": 0.2472528076171875, + "mean_token_accuracy": 0.9198214167356491, + "num_tokens": 4923382.0, + "step": 2000 + }, + { + "entropy": 0.3036586672067642, + "epoch": 5.481927710843373, + "grad_norm": 0.4003150165081024, + "learning_rate": 0.00010552822420675757, + "loss": 0.2524623489379883, + "mean_token_accuracy": 0.9182902538776397, + "num_tokens": 5042200.0, + "step": 2050 + }, + { + "entropy": 0.30276541873812673, + "epoch": 5.615796519410977, + "grad_norm": 0.5082385540008545, + "learning_rate": 0.00010064114172186765, + "loss": 0.2554252052307129, + "mean_token_accuracy": 0.9163929194211959, + "num_tokens": 5168285.0, + "step": 2100 + }, + { + "entropy": 0.30480867981910703, + "epoch": 5.749665327978581, + "grad_norm": 0.46462953090667725, + "learning_rate": 9.57629973226994e-05, + "loss": 0.25483154296875, + "mean_token_accuracy": 0.916156811118126, + "num_tokens": 5289880.0, + "step": 2150 + }, + { + "entropy": 0.2954915864765644, + "epoch": 5.883534136546185, + "grad_norm": 0.5345046520233154, + "learning_rate": 9.090441254622432e-05, + "loss": 0.24575115203857423, + "mean_token_accuracy": 0.9198049437999726, + "num_tokens": 5413325.0, + "step": 2200 + }, + { + "epoch": 6.0, + "eval_entropy": 0.37039283126592637, + "eval_loss": 0.6287115812301636, + "eval_mean_token_accuracy": 0.8435265091061592, + "eval_num_tokens": 5522622.0, + "eval_runtime": 49.0873, + "eval_samples_per_second": 32.575, + "eval_steps_per_second": 4.074, + "step": 2244 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.895304460277965e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..62108c4c8ad6a925598383a7b5a5345aa9b946e6 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.0652985372477836, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj", + "up_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..dc9610bcf09b076b4eb42799410e3743e9276540 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/trainer_state.json @@ -0,0 +1,631 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.0, + "eval_steps": 500, + "global_step": 2618, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.8254910862445832, + "epoch": 0.13386880856760375, + "grad_norm": 0.8958511352539062, + "learning_rate": 2.7446846603309888e-05, + "loss": 1.722928009033203, + "mean_token_accuracy": 0.6557231456041336, + "num_tokens": 122643.0, + "step": 50 + }, + { + "entropy": 0.8272530055046081, + "epoch": 0.2677376171352075, + "grad_norm": 0.6162250638008118, + "learning_rate": 5.5453832933217935e-05, + "loss": 0.7761137390136719, + "mean_token_accuracy": 0.795179500579834, + "num_tokens": 245702.0, + "step": 100 + }, + { + "entropy": 0.6629777508974075, + "epoch": 0.40160642570281124, + "grad_norm": 0.43394413590431213, + "learning_rate": 8.346081926312598e-05, + "loss": 0.6302793884277343, + "mean_token_accuracy": 0.8262274640798569, + "num_tokens": 371834.0, + "step": 150 + }, + { + "entropy": 0.6184763962030411, + "epoch": 0.535475234270415, + "grad_norm": 0.4373406767845154, + "learning_rate": 0.00011146780559303404, + "loss": 0.5863075256347656, + "mean_token_accuracy": 0.8357332807779312, + "num_tokens": 500948.0, + "step": 200 + }, + { + "entropy": 0.6116772794723511, + "epoch": 0.6693440428380187, + "grad_norm": 0.6781982183456421, + "learning_rate": 0.00013947479192294207, + "loss": 0.5783074951171875, + "mean_token_accuracy": 0.8368694090843201, + "num_tokens": 621874.0, + "step": 250 + }, + { + "entropy": 0.5848374783992767, + "epoch": 0.8032128514056225, + "grad_norm": 0.3493351340293884, + "learning_rate": 0.00016748177825285014, + "loss": 0.5531037902832031, + "mean_token_accuracy": 0.8434528934955597, + "num_tokens": 746138.0, + "step": 300 + }, + { + "entropy": 0.57043960750103, + "epoch": 0.9370816599732262, + "grad_norm": 0.3953551650047302, + "learning_rate": 0.00019548876458275817, + "loss": 0.5387083053588867, + "mean_token_accuracy": 0.844996885061264, + "num_tokens": 868331.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.6065685012936592, + "eval_loss": 0.5825985074043274, + "eval_mean_token_accuracy": 0.8326057174801826, + "eval_num_tokens": 920437.0, + "eval_runtime": 49.017, + "eval_samples_per_second": 32.621, + "eval_steps_per_second": 4.08, + "step": 374 + }, + { + "entropy": 0.5623676343397661, + "epoch": 1.069611780455154, + "grad_norm": 0.37431156635284424, + "learning_rate": 0.00020946374495076317, + "loss": 0.526231803894043, + "mean_token_accuracy": 0.8500469212580208, + "num_tokens": 980838.0, + "step": 400 + }, + { + "entropy": 0.5420066699385643, + "epoch": 1.2034805890227578, + "grad_norm": 0.3302382826805115, + "learning_rate": 0.00020923573570386192, + "loss": 0.5088261032104492, + "mean_token_accuracy": 0.8518517130613327, + "num_tokens": 1104763.0, + "step": 450 + }, + { + "entropy": 0.5347033357620239, + "epoch": 1.3373493975903614, + "grad_norm": 0.281392902135849, + "learning_rate": 0.00020878021367110025, + "loss": 0.5044374084472656, + "mean_token_accuracy": 0.8548643559217453, + "num_tokens": 1230459.0, + "step": 500 + }, + { + "entropy": 0.5319472518563271, + "epoch": 1.4712182061579653, + "grad_norm": 0.3298404812812805, + "learning_rate": 0.00020809817069357935, + "loss": 0.5011252593994141, + "mean_token_accuracy": 0.8546603417396545, + "num_tokens": 1356368.0, + "step": 550 + }, + { + "entropy": 0.5234624195098877, + "epoch": 1.605087014725569, + "grad_norm": 0.26496022939682007, + "learning_rate": 0.00020719109183285305, + "loss": 0.49288436889648435, + "mean_token_accuracy": 0.8559961414337158, + "num_tokens": 1484569.0, + "step": 600 + }, + { + "entropy": 0.5170091751217842, + "epoch": 1.7389558232931726, + "grad_norm": 0.2679271996021271, + "learning_rate": 0.00020606095213739626, + "loss": 0.4867116165161133, + "mean_token_accuracy": 0.8584025889635086, + "num_tokens": 1609962.0, + "step": 650 + }, + { + "entropy": 0.519580851495266, + "epoch": 1.8728246318607764, + "grad_norm": 0.2783832848072052, + "learning_rate": 0.0002047102123421885, + "loss": 0.4858899688720703, + "mean_token_accuracy": 0.8601382756233216, + "num_tokens": 1729667.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.5516054129600525, + "eval_loss": 0.5468233227729797, + "eval_mean_token_accuracy": 0.8418151989579201, + "eval_num_tokens": 1840874.0, + "eval_runtime": 49.0401, + "eval_samples_per_second": 32.606, + "eval_steps_per_second": 4.078, + "step": 748 + }, + { + "entropy": 0.5179645954960524, + "epoch": 2.005354752342704, + "grad_norm": 0.39848020672798157, + "learning_rate": 0.00020314181351077757, + "loss": 0.4828613662719727, + "mean_token_accuracy": 0.8594950991447525, + "num_tokens": 1846410.0, + "step": 750 + }, + { + "entropy": 0.4724735128879547, + "epoch": 2.139223560910308, + "grad_norm": 0.27450788021087646, + "learning_rate": 0.00020135917063148916, + "loss": 0.43275066375732424, + "mean_token_accuracy": 0.8704854369163513, + "num_tokens": 1971547.0, + "step": 800 + }, + { + "entropy": 0.4807534040510654, + "epoch": 2.2730923694779115, + "grad_norm": 0.3146534562110901, + "learning_rate": 0.00019936616518172531, + "loss": 0.44326435089111327, + "mean_token_accuracy": 0.8670724099874496, + "num_tokens": 2088560.0, + "step": 850 + }, + { + "entropy": 0.4744683504104614, + "epoch": 2.4069611780455156, + "grad_norm": 0.3151724338531494, + "learning_rate": 0.0001971671366765428, + "loss": 0.44036914825439455, + "mean_token_accuracy": 0.8687405550479889, + "num_tokens": 2210146.0, + "step": 900 + }, + { + "entropy": 0.4728820985555649, + "epoch": 2.540829986613119, + "grad_norm": 0.30318814516067505, + "learning_rate": 0.00019476687321991266, + "loss": 0.43707496643066407, + "mean_token_accuracy": 0.8694235664606095, + "num_tokens": 2331354.0, + "step": 950 + }, + { + "entropy": 0.4706392896175384, + "epoch": 2.674698795180723, + "grad_norm": 0.32992425560951233, + "learning_rate": 0.00019217060107923494, + "loss": 0.4379159545898437, + "mean_token_accuracy": 0.8695945852994919, + "num_tokens": 2452558.0, + "step": 1000 + }, + { + "entropy": 0.46944593608379365, + "epoch": 2.8085676037483265, + "grad_norm": 0.23659372329711914, + "learning_rate": 0.0001893839733058082, + "loss": 0.43395462036132815, + "mean_token_accuracy": 0.8707112389802932, + "num_tokens": 2582428.0, + "step": 1050 + }, + { + "entropy": 0.47586817651987073, + "epoch": 2.9424364123159306, + "grad_norm": 0.25799325108528137, + "learning_rate": 0.00018641305742603172, + "loss": 0.44020862579345704, + "mean_token_accuracy": 0.869452143907547, + "num_tokens": 2707532.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.5092438699305057, + "eval_loss": 0.5377861857414246, + "eval_mean_token_accuracy": 0.8451382353901863, + "eval_num_tokens": 2761311.0, + "eval_runtime": 49.0653, + "eval_samples_per_second": 32.589, + "eval_steps_per_second": 4.076, + "step": 1122 + }, + { + "entropy": 0.44004572521556506, + "epoch": 3.074966532797858, + "grad_norm": 0.31287693977355957, + "learning_rate": 0.0001832643222301409, + "loss": 0.39778636932373046, + "mean_token_accuracy": 0.8794932232962714, + "num_tokens": 2834344.0, + "step": 1150 + }, + { + "entropy": 0.42112498462200165, + "epoch": 3.208835341365462, + "grad_norm": 0.3411082327365875, + "learning_rate": 0.000179944623687242, + "loss": 0.3773982620239258, + "mean_token_accuracy": 0.8827895969152451, + "num_tokens": 2952748.0, + "step": 1200 + }, + { + "entropy": 0.41285495966672897, + "epoch": 3.3427041499330654, + "grad_norm": 0.31819090247154236, + "learning_rate": 0.0001764611900173143, + "loss": 0.3741728210449219, + "mean_token_accuracy": 0.8846501314640045, + "num_tokens": 3080333.0, + "step": 1250 + }, + { + "entropy": 0.42110098838806154, + "epoch": 3.4765729585006695, + "grad_norm": 0.3009640872478485, + "learning_rate": 0.00017282160595268327, + "loss": 0.3816569900512695, + "mean_token_accuracy": 0.8814844501018524, + "num_tokens": 3206920.0, + "step": 1300 + }, + { + "entropy": 0.4269171151518822, + "epoch": 3.610441767068273, + "grad_norm": 0.3651696741580963, + "learning_rate": 0.00016903379622323396, + "loss": 0.38763641357421874, + "mean_token_accuracy": 0.8813142603635788, + "num_tokens": 3327931.0, + "step": 1350 + }, + { + "entropy": 0.4262796178460121, + "epoch": 3.7443105756358768, + "grad_norm": 0.28190597891807556, + "learning_rate": 0.00016510600830132272, + "loss": 0.38563640594482423, + "mean_token_accuracy": 0.8813980734348297, + "num_tokens": 3443874.0, + "step": 1400 + }, + { + "entropy": 0.4200029063224793, + "epoch": 3.878179384203481, + "grad_norm": 0.2584955096244812, + "learning_rate": 0.00016104679444395854, + "loss": 0.3829658508300781, + "mean_token_accuracy": 0.8822885012626648, + "num_tokens": 3573158.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.47917599841952324, + "eval_loss": 0.5421923398971558, + "eval_mean_token_accuracy": 0.8468858799338341, + "eval_num_tokens": 3681748.0, + "eval_runtime": 49.0466, + "eval_samples_per_second": 32.602, + "eval_steps_per_second": 4.078, + "step": 1496 + }, + { + "entropy": 0.42061784803265273, + "epoch": 4.010709504685408, + "grad_norm": 0.3225401043891907, + "learning_rate": 0.0001568649930713548, + "loss": 0.38076282501220704, + "mean_token_accuracy": 0.8825460594109814, + "num_tokens": 3691654.0, + "step": 1500 + }, + { + "entropy": 0.3564541311562061, + "epoch": 4.144578313253012, + "grad_norm": 0.42701128125190735, + "learning_rate": 0.00015256970952239702, + "loss": 0.3080678176879883, + "mean_token_accuracy": 0.9021131205558777, + "num_tokens": 3813863.0, + "step": 1550 + }, + { + "entropy": 0.3623583456873894, + "epoch": 4.278447121820616, + "grad_norm": 0.7542179822921753, + "learning_rate": 0.00014817029622892904, + "loss": 0.31919103622436523, + "mean_token_accuracy": 0.8978805804252624, + "num_tokens": 3942009.0, + "step": 1600 + }, + { + "entropy": 0.35793897867202756, + "epoch": 4.412315930388219, + "grad_norm": 0.3683936595916748, + "learning_rate": 0.0001436763323520266, + "loss": 0.31606245040893555, + "mean_token_accuracy": 0.8989632934331894, + "num_tokens": 4067146.0, + "step": 1650 + }, + { + "entropy": 0.36605307310819624, + "epoch": 4.546184738955823, + "grad_norm": 0.3938419222831726, + "learning_rate": 0.00013909760292459586, + "loss": 0.3214926528930664, + "mean_token_accuracy": 0.897950147986412, + "num_tokens": 4186586.0, + "step": 1700 + }, + { + "entropy": 0.3702411252260208, + "epoch": 4.680053547523427, + "grad_norm": 0.3159140646457672, + "learning_rate": 0.0001344440775457131, + "loss": 0.32606857299804687, + "mean_token_accuracy": 0.8971680045127869, + "num_tokens": 4307001.0, + "step": 1750 + }, + { + "entropy": 0.3695961621403694, + "epoch": 4.813922356091031, + "grad_norm": 0.43663182854652405, + "learning_rate": 0.00012972588867309488, + "loss": 0.324642448425293, + "mean_token_accuracy": 0.8974496972560883, + "num_tokens": 4429513.0, + "step": 1800 + }, + { + "entropy": 0.3658722630143166, + "epoch": 4.947791164658635, + "grad_norm": 0.3189752995967865, + "learning_rate": 0.0001249533095609642, + "loss": 0.3198036575317383, + "mean_token_accuracy": 0.8985732847452164, + "num_tokens": 4557562.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.43428498685359956, + "eval_loss": 0.5698739290237427, + "eval_mean_token_accuracy": 0.8450208070874214, + "eval_num_tokens": 4602185.0, + "eval_runtime": 49.0399, + "eval_samples_per_second": 32.606, + "eval_steps_per_second": 4.078, + "step": 1870 + }, + { + "entropy": 0.3230800422454121, + "epoch": 5.080321285140562, + "grad_norm": 0.38153186440467834, + "learning_rate": 0.00012013673189135029, + "loss": 0.2727243995666504, + "mean_token_accuracy": 0.9120446022110756, + "num_tokens": 4679483.0, + "step": 1900 + }, + { + "entropy": 0.2979305517673492, + "epoch": 5.214190093708166, + "grad_norm": 0.42468664050102234, + "learning_rate": 0.00011528664314752708, + "loss": 0.24437490463256836, + "mean_token_accuracy": 0.9198145979642868, + "num_tokens": 4800131.0, + "step": 1950 + }, + { + "entropy": 0.29811844661831854, + "epoch": 5.34805890227577, + "grad_norm": 0.48722052574157715, + "learning_rate": 0.0001104136037788565, + "loss": 0.2472528076171875, + "mean_token_accuracy": 0.9198214167356491, + "num_tokens": 4923382.0, + "step": 2000 + }, + { + "entropy": 0.3036586672067642, + "epoch": 5.481927710843373, + "grad_norm": 0.4003150165081024, + "learning_rate": 0.00010552822420675757, + "loss": 0.2524623489379883, + "mean_token_accuracy": 0.9182902538776397, + "num_tokens": 5042200.0, + "step": 2050 + }, + { + "entropy": 0.30276541873812673, + "epoch": 5.615796519410977, + "grad_norm": 0.5082385540008545, + "learning_rate": 0.00010064114172186765, + "loss": 0.2554252052307129, + "mean_token_accuracy": 0.9163929194211959, + "num_tokens": 5168285.0, + "step": 2100 + }, + { + "entropy": 0.30480867981910703, + "epoch": 5.749665327978581, + "grad_norm": 0.46462953090667725, + "learning_rate": 9.57629973226994e-05, + "loss": 0.25483154296875, + "mean_token_accuracy": 0.916156811118126, + "num_tokens": 5289880.0, + "step": 2150 + }, + { + "entropy": 0.2954915864765644, + "epoch": 5.883534136546185, + "grad_norm": 0.5345046520233154, + "learning_rate": 9.090441254622432e-05, + "loss": 0.24575115203857423, + "mean_token_accuracy": 0.9198049437999726, + "num_tokens": 5413325.0, + "step": 2200 + }, + { + "epoch": 6.0, + "eval_entropy": 0.37039283126592637, + "eval_loss": 0.6287115812301636, + "eval_mean_token_accuracy": 0.8435265091061592, + "eval_num_tokens": 5522622.0, + "eval_runtime": 49.0873, + "eval_samples_per_second": 32.575, + "eval_steps_per_second": 4.074, + "step": 2244 + }, + { + "entropy": 0.2953245359839815, + "epoch": 6.016064257028113, + "grad_norm": 0.4055909216403961, + "learning_rate": 8.607596634083136e-05, + "loss": 0.24116868972778321, + "mean_token_accuracy": 0.9220171468426482, + "num_tokens": 5536511.0, + "step": 2250 + }, + { + "entropy": 0.229744790494442, + "epoch": 6.149933065595716, + "grad_norm": 0.4221028983592987, + "learning_rate": 8.128817203201665e-05, + "loss": 0.1732115364074707, + "mean_token_accuracy": 0.9427249735593796, + "num_tokens": 5663054.0, + "step": 2300 + }, + { + "entropy": 0.23100735485553742, + "epoch": 6.28380187416332, + "grad_norm": 0.5276848673820496, + "learning_rate": 7.655145443095877e-05, + "loss": 0.1742458724975586, + "mean_token_accuracy": 0.9424393928050995, + "num_tokens": 5790379.0, + "step": 2350 + }, + { + "entropy": 0.2334547135233879, + "epoch": 6.417670682730924, + "grad_norm": 0.5443829298019409, + "learning_rate": 7.187612713582257e-05, + "loss": 0.17723684310913085, + "mean_token_accuracy": 0.9421556174755097, + "num_tokens": 5908852.0, + "step": 2400 + }, + { + "entropy": 0.22672353580594062, + "epoch": 6.551539491298527, + "grad_norm": 0.4869251251220703, + "learning_rate": 6.727237007521524e-05, + "loss": 0.17469547271728517, + "mean_token_accuracy": 0.9419155931472778, + "num_tokens": 6033966.0, + "step": 2450 + }, + { + "entropy": 0.23190354615449904, + "epoch": 6.685408299866131, + "grad_norm": 0.5983045697212219, + "learning_rate": 6.275020734269083e-05, + "loss": 0.17733327865600587, + "mean_token_accuracy": 0.9419048410654068, + "num_tokens": 6156278.0, + "step": 2500 + }, + { + "entropy": 0.23502119958400727, + "epoch": 6.8192771084337345, + "grad_norm": 0.5186291337013245, + "learning_rate": 5.831948537056545e-05, + "loss": 0.18074512481689453, + "mean_token_accuracy": 0.9402925485372543, + "num_tokens": 6276774.0, + "step": 2550 + }, + { + "entropy": 0.23055340006947517, + "epoch": 6.953145917001339, + "grad_norm": 0.48175379633903503, + "learning_rate": 5.3989851490567374e-05, + "loss": 0.17573400497436523, + "mean_token_accuracy": 0.9420478469133378, + "num_tokens": 6401444.0, + "step": 2600 + }, + { + "epoch": 7.0, + "eval_entropy": 0.3201144814491272, + "eval_loss": 0.7185283899307251, + "eval_mean_token_accuracy": 0.8399944826960564, + "eval_num_tokens": 6443059.0, + "eval_runtime": 49.0868, + "eval_samples_per_second": 32.575, + "eval_steps_per_second": 4.074, + "step": 2618 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.380063552309576e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..62108c4c8ad6a925598383a7b5a5345aa9b946e6 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.0652985372477836, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj", + "up_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0f8a9cf31bf0398cf6faaac0d8c73b5273551089 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/trainer_state.json @@ -0,0 +1,712 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.0, + "eval_steps": 500, + "global_step": 2992, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.8254910862445832, + "epoch": 0.13386880856760375, + "grad_norm": 0.8958511352539062, + "learning_rate": 2.7446846603309888e-05, + "loss": 1.722928009033203, + "mean_token_accuracy": 0.6557231456041336, + "num_tokens": 122643.0, + "step": 50 + }, + { + "entropy": 0.8272530055046081, + "epoch": 0.2677376171352075, + "grad_norm": 0.6162250638008118, + "learning_rate": 5.5453832933217935e-05, + "loss": 0.7761137390136719, + "mean_token_accuracy": 0.795179500579834, + "num_tokens": 245702.0, + "step": 100 + }, + { + "entropy": 0.6629777508974075, + "epoch": 0.40160642570281124, + "grad_norm": 0.43394413590431213, + "learning_rate": 8.346081926312598e-05, + "loss": 0.6302793884277343, + "mean_token_accuracy": 0.8262274640798569, + "num_tokens": 371834.0, + "step": 150 + }, + { + "entropy": 0.6184763962030411, + "epoch": 0.535475234270415, + "grad_norm": 0.4373406767845154, + "learning_rate": 0.00011146780559303404, + "loss": 0.5863075256347656, + "mean_token_accuracy": 0.8357332807779312, + "num_tokens": 500948.0, + "step": 200 + }, + { + "entropy": 0.6116772794723511, + "epoch": 0.6693440428380187, + "grad_norm": 0.6781982183456421, + "learning_rate": 0.00013947479192294207, + "loss": 0.5783074951171875, + "mean_token_accuracy": 0.8368694090843201, + "num_tokens": 621874.0, + "step": 250 + }, + { + "entropy": 0.5848374783992767, + "epoch": 0.8032128514056225, + "grad_norm": 0.3493351340293884, + "learning_rate": 0.00016748177825285014, + "loss": 0.5531037902832031, + "mean_token_accuracy": 0.8434528934955597, + "num_tokens": 746138.0, + "step": 300 + }, + { + "entropy": 0.57043960750103, + "epoch": 0.9370816599732262, + "grad_norm": 0.3953551650047302, + "learning_rate": 0.00019548876458275817, + "loss": 0.5387083053588867, + "mean_token_accuracy": 0.844996885061264, + "num_tokens": 868331.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.6065685012936592, + "eval_loss": 0.5825985074043274, + "eval_mean_token_accuracy": 0.8326057174801826, + "eval_num_tokens": 920437.0, + "eval_runtime": 49.017, + "eval_samples_per_second": 32.621, + "eval_steps_per_second": 4.08, + "step": 374 + }, + { + "entropy": 0.5623676343397661, + "epoch": 1.069611780455154, + "grad_norm": 0.37431156635284424, + "learning_rate": 0.00020946374495076317, + "loss": 0.526231803894043, + "mean_token_accuracy": 0.8500469212580208, + "num_tokens": 980838.0, + "step": 400 + }, + { + "entropy": 0.5420066699385643, + "epoch": 1.2034805890227578, + "grad_norm": 0.3302382826805115, + "learning_rate": 0.00020923573570386192, + "loss": 0.5088261032104492, + "mean_token_accuracy": 0.8518517130613327, + "num_tokens": 1104763.0, + "step": 450 + }, + { + "entropy": 0.5347033357620239, + "epoch": 1.3373493975903614, + "grad_norm": 0.281392902135849, + "learning_rate": 0.00020878021367110025, + "loss": 0.5044374084472656, + "mean_token_accuracy": 0.8548643559217453, + "num_tokens": 1230459.0, + "step": 500 + }, + { + "entropy": 0.5319472518563271, + "epoch": 1.4712182061579653, + "grad_norm": 0.3298404812812805, + "learning_rate": 0.00020809817069357935, + "loss": 0.5011252593994141, + "mean_token_accuracy": 0.8546603417396545, + "num_tokens": 1356368.0, + "step": 550 + }, + { + "entropy": 0.5234624195098877, + "epoch": 1.605087014725569, + "grad_norm": 0.26496022939682007, + "learning_rate": 0.00020719109183285305, + "loss": 0.49288436889648435, + "mean_token_accuracy": 0.8559961414337158, + "num_tokens": 1484569.0, + "step": 600 + }, + { + "entropy": 0.5170091751217842, + "epoch": 1.7389558232931726, + "grad_norm": 0.2679271996021271, + "learning_rate": 0.00020606095213739626, + "loss": 0.4867116165161133, + "mean_token_accuracy": 0.8584025889635086, + "num_tokens": 1609962.0, + "step": 650 + }, + { + "entropy": 0.519580851495266, + "epoch": 1.8728246318607764, + "grad_norm": 0.2783832848072052, + "learning_rate": 0.0002047102123421885, + "loss": 0.4858899688720703, + "mean_token_accuracy": 0.8601382756233216, + "num_tokens": 1729667.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.5516054129600525, + "eval_loss": 0.5468233227729797, + "eval_mean_token_accuracy": 0.8418151989579201, + "eval_num_tokens": 1840874.0, + "eval_runtime": 49.0401, + "eval_samples_per_second": 32.606, + "eval_steps_per_second": 4.078, + "step": 748 + }, + { + "entropy": 0.5179645954960524, + "epoch": 2.005354752342704, + "grad_norm": 0.39848020672798157, + "learning_rate": 0.00020314181351077757, + "loss": 0.4828613662719727, + "mean_token_accuracy": 0.8594950991447525, + "num_tokens": 1846410.0, + "step": 750 + }, + { + "entropy": 0.4724735128879547, + "epoch": 2.139223560910308, + "grad_norm": 0.27450788021087646, + "learning_rate": 0.00020135917063148916, + "loss": 0.43275066375732424, + "mean_token_accuracy": 0.8704854369163513, + "num_tokens": 1971547.0, + "step": 800 + }, + { + "entropy": 0.4807534040510654, + "epoch": 2.2730923694779115, + "grad_norm": 0.3146534562110901, + "learning_rate": 0.00019936616518172531, + "loss": 0.44326435089111327, + "mean_token_accuracy": 0.8670724099874496, + "num_tokens": 2088560.0, + "step": 850 + }, + { + "entropy": 0.4744683504104614, + "epoch": 2.4069611780455156, + "grad_norm": 0.3151724338531494, + "learning_rate": 0.0001971671366765428, + "loss": 0.44036914825439455, + "mean_token_accuracy": 0.8687405550479889, + "num_tokens": 2210146.0, + "step": 900 + }, + { + "entropy": 0.4728820985555649, + "epoch": 2.540829986613119, + "grad_norm": 0.30318814516067505, + "learning_rate": 0.00019476687321991266, + "loss": 0.43707496643066407, + "mean_token_accuracy": 0.8694235664606095, + "num_tokens": 2331354.0, + "step": 950 + }, + { + "entropy": 0.4706392896175384, + "epoch": 2.674698795180723, + "grad_norm": 0.32992425560951233, + "learning_rate": 0.00019217060107923494, + "loss": 0.4379159545898437, + "mean_token_accuracy": 0.8695945852994919, + "num_tokens": 2452558.0, + "step": 1000 + }, + { + "entropy": 0.46944593608379365, + "epoch": 2.8085676037483265, + "grad_norm": 0.23659372329711914, + "learning_rate": 0.0001893839733058082, + "loss": 0.43395462036132815, + "mean_token_accuracy": 0.8707112389802932, + "num_tokens": 2582428.0, + "step": 1050 + }, + { + "entropy": 0.47586817651987073, + "epoch": 2.9424364123159306, + "grad_norm": 0.25799325108528137, + "learning_rate": 0.00018641305742603172, + "loss": 0.44020862579345704, + "mean_token_accuracy": 0.869452143907547, + "num_tokens": 2707532.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.5092438699305057, + "eval_loss": 0.5377861857414246, + "eval_mean_token_accuracy": 0.8451382353901863, + "eval_num_tokens": 2761311.0, + "eval_runtime": 49.0653, + "eval_samples_per_second": 32.589, + "eval_steps_per_second": 4.076, + "step": 1122 + }, + { + "entropy": 0.44004572521556506, + "epoch": 3.074966532797858, + "grad_norm": 0.31287693977355957, + "learning_rate": 0.0001832643222301409, + "loss": 0.39778636932373046, + "mean_token_accuracy": 0.8794932232962714, + "num_tokens": 2834344.0, + "step": 1150 + }, + { + "entropy": 0.42112498462200165, + "epoch": 3.208835341365462, + "grad_norm": 0.3411082327365875, + "learning_rate": 0.000179944623687242, + "loss": 0.3773982620239258, + "mean_token_accuracy": 0.8827895969152451, + "num_tokens": 2952748.0, + "step": 1200 + }, + { + "entropy": 0.41285495966672897, + "epoch": 3.3427041499330654, + "grad_norm": 0.31819090247154236, + "learning_rate": 0.0001764611900173143, + "loss": 0.3741728210449219, + "mean_token_accuracy": 0.8846501314640045, + "num_tokens": 3080333.0, + "step": 1250 + }, + { + "entropy": 0.42110098838806154, + "epoch": 3.4765729585006695, + "grad_norm": 0.3009640872478485, + "learning_rate": 0.00017282160595268327, + "loss": 0.3816569900512695, + "mean_token_accuracy": 0.8814844501018524, + "num_tokens": 3206920.0, + "step": 1300 + }, + { + "entropy": 0.4269171151518822, + "epoch": 3.610441767068273, + "grad_norm": 0.3651696741580963, + "learning_rate": 0.00016903379622323396, + "loss": 0.38763641357421874, + "mean_token_accuracy": 0.8813142603635788, + "num_tokens": 3327931.0, + "step": 1350 + }, + { + "entropy": 0.4262796178460121, + "epoch": 3.7443105756358768, + "grad_norm": 0.28190597891807556, + "learning_rate": 0.00016510600830132272, + "loss": 0.38563640594482423, + "mean_token_accuracy": 0.8813980734348297, + "num_tokens": 3443874.0, + "step": 1400 + }, + { + "entropy": 0.4200029063224793, + "epoch": 3.878179384203481, + "grad_norm": 0.2584955096244812, + "learning_rate": 0.00016104679444395854, + "loss": 0.3829658508300781, + "mean_token_accuracy": 0.8822885012626648, + "num_tokens": 3573158.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.47917599841952324, + "eval_loss": 0.5421923398971558, + "eval_mean_token_accuracy": 0.8468858799338341, + "eval_num_tokens": 3681748.0, + "eval_runtime": 49.0466, + "eval_samples_per_second": 32.602, + "eval_steps_per_second": 4.078, + "step": 1496 + }, + { + "entropy": 0.42061784803265273, + "epoch": 4.010709504685408, + "grad_norm": 0.3225401043891907, + "learning_rate": 0.0001568649930713548, + "loss": 0.38076282501220704, + "mean_token_accuracy": 0.8825460594109814, + "num_tokens": 3691654.0, + "step": 1500 + }, + { + "entropy": 0.3564541311562061, + "epoch": 4.144578313253012, + "grad_norm": 0.42701128125190735, + "learning_rate": 0.00015256970952239702, + "loss": 0.3080678176879883, + "mean_token_accuracy": 0.9021131205558777, + "num_tokens": 3813863.0, + "step": 1550 + }, + { + "entropy": 0.3623583456873894, + "epoch": 4.278447121820616, + "grad_norm": 0.7542179822921753, + "learning_rate": 0.00014817029622892904, + "loss": 0.31919103622436523, + "mean_token_accuracy": 0.8978805804252624, + "num_tokens": 3942009.0, + "step": 1600 + }, + { + "entropy": 0.35793897867202756, + "epoch": 4.412315930388219, + "grad_norm": 0.3683936595916748, + "learning_rate": 0.0001436763323520266, + "loss": 0.31606245040893555, + "mean_token_accuracy": 0.8989632934331894, + "num_tokens": 4067146.0, + "step": 1650 + }, + { + "entropy": 0.36605307310819624, + "epoch": 4.546184738955823, + "grad_norm": 0.3938419222831726, + "learning_rate": 0.00013909760292459586, + "loss": 0.3214926528930664, + "mean_token_accuracy": 0.897950147986412, + "num_tokens": 4186586.0, + "step": 1700 + }, + { + "entropy": 0.3702411252260208, + "epoch": 4.680053547523427, + "grad_norm": 0.3159140646457672, + "learning_rate": 0.0001344440775457131, + "loss": 0.32606857299804687, + "mean_token_accuracy": 0.8971680045127869, + "num_tokens": 4307001.0, + "step": 1750 + }, + { + "entropy": 0.3695961621403694, + "epoch": 4.813922356091031, + "grad_norm": 0.43663182854652405, + "learning_rate": 0.00012972588867309488, + "loss": 0.324642448425293, + "mean_token_accuracy": 0.8974496972560883, + "num_tokens": 4429513.0, + "step": 1800 + }, + { + "entropy": 0.3658722630143166, + "epoch": 4.947791164658635, + "grad_norm": 0.3189752995967865, + "learning_rate": 0.0001249533095609642, + "loss": 0.3198036575317383, + "mean_token_accuracy": 0.8985732847452164, + "num_tokens": 4557562.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.43428498685359956, + "eval_loss": 0.5698739290237427, + "eval_mean_token_accuracy": 0.8450208070874214, + "eval_num_tokens": 4602185.0, + "eval_runtime": 49.0399, + "eval_samples_per_second": 32.606, + "eval_steps_per_second": 4.078, + "step": 1870 + }, + { + "entropy": 0.3230800422454121, + "epoch": 5.080321285140562, + "grad_norm": 0.38153186440467834, + "learning_rate": 0.00012013673189135029, + "loss": 0.2727243995666504, + "mean_token_accuracy": 0.9120446022110756, + "num_tokens": 4679483.0, + "step": 1900 + }, + { + "entropy": 0.2979305517673492, + "epoch": 5.214190093708166, + "grad_norm": 0.42468664050102234, + "learning_rate": 0.00011528664314752708, + "loss": 0.24437490463256836, + "mean_token_accuracy": 0.9198145979642868, + "num_tokens": 4800131.0, + "step": 1950 + }, + { + "entropy": 0.29811844661831854, + "epoch": 5.34805890227577, + "grad_norm": 0.48722052574157715, + "learning_rate": 0.0001104136037788565, + "loss": 0.2472528076171875, + "mean_token_accuracy": 0.9198214167356491, + "num_tokens": 4923382.0, + "step": 2000 + }, + { + "entropy": 0.3036586672067642, + "epoch": 5.481927710843373, + "grad_norm": 0.4003150165081024, + "learning_rate": 0.00010552822420675757, + "loss": 0.2524623489379883, + "mean_token_accuracy": 0.9182902538776397, + "num_tokens": 5042200.0, + "step": 2050 + }, + { + "entropy": 0.30276541873812673, + "epoch": 5.615796519410977, + "grad_norm": 0.5082385540008545, + "learning_rate": 0.00010064114172186765, + "loss": 0.2554252052307129, + "mean_token_accuracy": 0.9163929194211959, + "num_tokens": 5168285.0, + "step": 2100 + }, + { + "entropy": 0.30480867981910703, + "epoch": 5.749665327978581, + "grad_norm": 0.46462953090667725, + "learning_rate": 9.57629973226994e-05, + "loss": 0.25483154296875, + "mean_token_accuracy": 0.916156811118126, + "num_tokens": 5289880.0, + "step": 2150 + }, + { + "entropy": 0.2954915864765644, + "epoch": 5.883534136546185, + "grad_norm": 0.5345046520233154, + "learning_rate": 9.090441254622432e-05, + "loss": 0.24575115203857423, + "mean_token_accuracy": 0.9198049437999726, + "num_tokens": 5413325.0, + "step": 2200 + }, + { + "epoch": 6.0, + "eval_entropy": 0.37039283126592637, + "eval_loss": 0.6287115812301636, + "eval_mean_token_accuracy": 0.8435265091061592, + "eval_num_tokens": 5522622.0, + "eval_runtime": 49.0873, + "eval_samples_per_second": 32.575, + "eval_steps_per_second": 4.074, + "step": 2244 + }, + { + "entropy": 0.2953245359839815, + "epoch": 6.016064257028113, + "grad_norm": 0.4055909216403961, + "learning_rate": 8.607596634083136e-05, + "loss": 0.24116868972778321, + "mean_token_accuracy": 0.9220171468426482, + "num_tokens": 5536511.0, + "step": 2250 + }, + { + "entropy": 0.229744790494442, + "epoch": 6.149933065595716, + "grad_norm": 0.4221028983592987, + "learning_rate": 8.128817203201665e-05, + "loss": 0.1732115364074707, + "mean_token_accuracy": 0.9427249735593796, + "num_tokens": 5663054.0, + "step": 2300 + }, + { + "entropy": 0.23100735485553742, + "epoch": 6.28380187416332, + "grad_norm": 0.5276848673820496, + "learning_rate": 7.655145443095877e-05, + "loss": 0.1742458724975586, + "mean_token_accuracy": 0.9424393928050995, + "num_tokens": 5790379.0, + "step": 2350 + }, + { + "entropy": 0.2334547135233879, + "epoch": 6.417670682730924, + "grad_norm": 0.5443829298019409, + "learning_rate": 7.187612713582257e-05, + "loss": 0.17723684310913085, + "mean_token_accuracy": 0.9421556174755097, + "num_tokens": 5908852.0, + "step": 2400 + }, + { + "entropy": 0.22672353580594062, + "epoch": 6.551539491298527, + "grad_norm": 0.4869251251220703, + "learning_rate": 6.727237007521524e-05, + "loss": 0.17469547271728517, + "mean_token_accuracy": 0.9419155931472778, + "num_tokens": 6033966.0, + "step": 2450 + }, + { + "entropy": 0.23190354615449904, + "epoch": 6.685408299866131, + "grad_norm": 0.5983045697212219, + "learning_rate": 6.275020734269083e-05, + "loss": 0.17733327865600587, + "mean_token_accuracy": 0.9419048410654068, + "num_tokens": 6156278.0, + "step": 2500 + }, + { + "entropy": 0.23502119958400727, + "epoch": 6.8192771084337345, + "grad_norm": 0.5186291337013245, + "learning_rate": 5.831948537056545e-05, + "loss": 0.18074512481689453, + "mean_token_accuracy": 0.9402925485372543, + "num_tokens": 6276774.0, + "step": 2550 + }, + { + "entropy": 0.23055340006947517, + "epoch": 6.953145917001339, + "grad_norm": 0.48175379633903503, + "learning_rate": 5.3989851490567374e-05, + "loss": 0.17573400497436523, + "mean_token_accuracy": 0.9420478469133378, + "num_tokens": 6401444.0, + "step": 2600 + }, + { + "epoch": 7.0, + "eval_entropy": 0.3201144814491272, + "eval_loss": 0.7185283899307251, + "eval_mean_token_accuracy": 0.8399944826960564, + "eval_num_tokens": 6443059.0, + "eval_runtime": 49.0868, + "eval_samples_per_second": 32.575, + "eval_steps_per_second": 4.074, + "step": 2618 + }, + { + "entropy": 0.19884085278920452, + "epoch": 7.085676037483267, + "grad_norm": 0.4999229311943054, + "learning_rate": 4.977073292800337e-05, + "loss": 0.13776198387145996, + "mean_token_accuracy": 0.95504442369095, + "num_tokens": 6525142.0, + "step": 2650 + }, + { + "entropy": 0.17382358580827714, + "epoch": 7.21954484605087, + "grad_norm": 0.44681516289711, + "learning_rate": 4.567131627517827e-05, + "loss": 0.1151345157623291, + "mean_token_accuracy": 0.9624496775865555, + "num_tokens": 6651930.0, + "step": 2700 + }, + { + "entropy": 0.17993666499853134, + "epoch": 7.353413654618474, + "grad_norm": 0.47544270753860474, + "learning_rate": 4.1700527488762594e-05, + "loss": 0.12008686065673828, + "mean_token_accuracy": 0.9607802790403366, + "num_tokens": 6768469.0, + "step": 2750 + }, + { + "entropy": 0.17229609042406083, + "epoch": 7.4872824631860775, + "grad_norm": 0.4898432791233063, + "learning_rate": 3.786701245466089e-05, + "loss": 0.1164663314819336, + "mean_token_accuracy": 0.9622354304790497, + "num_tokens": 6892532.0, + "step": 2800 + }, + { + "entropy": 0.16839693702757358, + "epoch": 7.621151271753681, + "grad_norm": 0.5927155613899231, + "learning_rate": 3.417911816269838e-05, + "loss": 0.1138334846496582, + "mean_token_accuracy": 0.9632772338390351, + "num_tokens": 7023373.0, + "step": 2850 + }, + { + "entropy": 0.173521406725049, + "epoch": 7.755020080321285, + "grad_norm": 0.5407077670097351, + "learning_rate": 3.0644874532115575e-05, + "loss": 0.11670659065246582, + "mean_token_accuracy": 0.9622769457101822, + "num_tokens": 7146448.0, + "step": 2900 + }, + { + "entropy": 0.17213394075632096, + "epoch": 7.888888888888889, + "grad_norm": 0.48647794127464294, + "learning_rate": 2.727197692744389e-05, + "loss": 0.11715221405029297, + "mean_token_accuracy": 0.9625237709283829, + "num_tokens": 7267680.0, + "step": 2950 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2757616487890482, + "eval_loss": 0.8189995884895325, + "eval_mean_token_accuracy": 0.8390460336208343, + "eval_num_tokens": 7363496.0, + "eval_runtime": 49.0999, + "eval_samples_per_second": 32.566, + "eval_steps_per_second": 4.073, + "step": 2992 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.861809120790405e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..62108c4c8ad6a925598383a7b5a5345aa9b946e6 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.0652985372477836, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj", + "up_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c5337300fd7964cff13080074b6e5a58680c6bba --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/trainer_state.json @@ -0,0 +1,803 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.0, + "eval_steps": 500, + "global_step": 3366, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.8254910862445832, + "epoch": 0.13386880856760375, + "grad_norm": 0.8958511352539062, + "learning_rate": 2.7446846603309888e-05, + "loss": 1.722928009033203, + "mean_token_accuracy": 0.6557231456041336, + "num_tokens": 122643.0, + "step": 50 + }, + { + "entropy": 0.8272530055046081, + "epoch": 0.2677376171352075, + "grad_norm": 0.6162250638008118, + "learning_rate": 5.5453832933217935e-05, + "loss": 0.7761137390136719, + "mean_token_accuracy": 0.795179500579834, + "num_tokens": 245702.0, + "step": 100 + }, + { + "entropy": 0.6629777508974075, + "epoch": 0.40160642570281124, + "grad_norm": 0.43394413590431213, + "learning_rate": 8.346081926312598e-05, + "loss": 0.6302793884277343, + "mean_token_accuracy": 0.8262274640798569, + "num_tokens": 371834.0, + "step": 150 + }, + { + "entropy": 0.6184763962030411, + "epoch": 0.535475234270415, + "grad_norm": 0.4373406767845154, + "learning_rate": 0.00011146780559303404, + "loss": 0.5863075256347656, + "mean_token_accuracy": 0.8357332807779312, + "num_tokens": 500948.0, + "step": 200 + }, + { + "entropy": 0.6116772794723511, + "epoch": 0.6693440428380187, + "grad_norm": 0.6781982183456421, + "learning_rate": 0.00013947479192294207, + "loss": 0.5783074951171875, + "mean_token_accuracy": 0.8368694090843201, + "num_tokens": 621874.0, + "step": 250 + }, + { + "entropy": 0.5848374783992767, + "epoch": 0.8032128514056225, + "grad_norm": 0.3493351340293884, + "learning_rate": 0.00016748177825285014, + "loss": 0.5531037902832031, + "mean_token_accuracy": 0.8434528934955597, + "num_tokens": 746138.0, + "step": 300 + }, + { + "entropy": 0.57043960750103, + "epoch": 0.9370816599732262, + "grad_norm": 0.3953551650047302, + "learning_rate": 0.00019548876458275817, + "loss": 0.5387083053588867, + "mean_token_accuracy": 0.844996885061264, + "num_tokens": 868331.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.6065685012936592, + "eval_loss": 0.5825985074043274, + "eval_mean_token_accuracy": 0.8326057174801826, + "eval_num_tokens": 920437.0, + "eval_runtime": 49.017, + "eval_samples_per_second": 32.621, + "eval_steps_per_second": 4.08, + "step": 374 + }, + { + "entropy": 0.5623676343397661, + "epoch": 1.069611780455154, + "grad_norm": 0.37431156635284424, + "learning_rate": 0.00020946374495076317, + "loss": 0.526231803894043, + "mean_token_accuracy": 0.8500469212580208, + "num_tokens": 980838.0, + "step": 400 + }, + { + "entropy": 0.5420066699385643, + "epoch": 1.2034805890227578, + "grad_norm": 0.3302382826805115, + "learning_rate": 0.00020923573570386192, + "loss": 0.5088261032104492, + "mean_token_accuracy": 0.8518517130613327, + "num_tokens": 1104763.0, + "step": 450 + }, + { + "entropy": 0.5347033357620239, + "epoch": 1.3373493975903614, + "grad_norm": 0.281392902135849, + "learning_rate": 0.00020878021367110025, + "loss": 0.5044374084472656, + "mean_token_accuracy": 0.8548643559217453, + "num_tokens": 1230459.0, + "step": 500 + }, + { + "entropy": 0.5319472518563271, + "epoch": 1.4712182061579653, + "grad_norm": 0.3298404812812805, + "learning_rate": 0.00020809817069357935, + "loss": 0.5011252593994141, + "mean_token_accuracy": 0.8546603417396545, + "num_tokens": 1356368.0, + "step": 550 + }, + { + "entropy": 0.5234624195098877, + "epoch": 1.605087014725569, + "grad_norm": 0.26496022939682007, + "learning_rate": 0.00020719109183285305, + "loss": 0.49288436889648435, + "mean_token_accuracy": 0.8559961414337158, + "num_tokens": 1484569.0, + "step": 600 + }, + { + "entropy": 0.5170091751217842, + "epoch": 1.7389558232931726, + "grad_norm": 0.2679271996021271, + "learning_rate": 0.00020606095213739626, + "loss": 0.4867116165161133, + "mean_token_accuracy": 0.8584025889635086, + "num_tokens": 1609962.0, + "step": 650 + }, + { + "entropy": 0.519580851495266, + "epoch": 1.8728246318607764, + "grad_norm": 0.2783832848072052, + "learning_rate": 0.0002047102123421885, + "loss": 0.4858899688720703, + "mean_token_accuracy": 0.8601382756233216, + "num_tokens": 1729667.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.5516054129600525, + "eval_loss": 0.5468233227729797, + "eval_mean_token_accuracy": 0.8418151989579201, + "eval_num_tokens": 1840874.0, + "eval_runtime": 49.0401, + "eval_samples_per_second": 32.606, + "eval_steps_per_second": 4.078, + "step": 748 + }, + { + "entropy": 0.5179645954960524, + "epoch": 2.005354752342704, + "grad_norm": 0.39848020672798157, + "learning_rate": 0.00020314181351077757, + "loss": 0.4828613662719727, + "mean_token_accuracy": 0.8594950991447525, + "num_tokens": 1846410.0, + "step": 750 + }, + { + "entropy": 0.4724735128879547, + "epoch": 2.139223560910308, + "grad_norm": 0.27450788021087646, + "learning_rate": 0.00020135917063148916, + "loss": 0.43275066375732424, + "mean_token_accuracy": 0.8704854369163513, + "num_tokens": 1971547.0, + "step": 800 + }, + { + "entropy": 0.4807534040510654, + "epoch": 2.2730923694779115, + "grad_norm": 0.3146534562110901, + "learning_rate": 0.00019936616518172531, + "loss": 0.44326435089111327, + "mean_token_accuracy": 0.8670724099874496, + "num_tokens": 2088560.0, + "step": 850 + }, + { + "entropy": 0.4744683504104614, + "epoch": 2.4069611780455156, + "grad_norm": 0.3151724338531494, + "learning_rate": 0.0001971671366765428, + "loss": 0.44036914825439455, + "mean_token_accuracy": 0.8687405550479889, + "num_tokens": 2210146.0, + "step": 900 + }, + { + "entropy": 0.4728820985555649, + "epoch": 2.540829986613119, + "grad_norm": 0.30318814516067505, + "learning_rate": 0.00019476687321991266, + "loss": 0.43707496643066407, + "mean_token_accuracy": 0.8694235664606095, + "num_tokens": 2331354.0, + "step": 950 + }, + { + "entropy": 0.4706392896175384, + "epoch": 2.674698795180723, + "grad_norm": 0.32992425560951233, + "learning_rate": 0.00019217060107923494, + "loss": 0.4379159545898437, + "mean_token_accuracy": 0.8695945852994919, + "num_tokens": 2452558.0, + "step": 1000 + }, + { + "entropy": 0.46944593608379365, + "epoch": 2.8085676037483265, + "grad_norm": 0.23659372329711914, + "learning_rate": 0.0001893839733058082, + "loss": 0.43395462036132815, + "mean_token_accuracy": 0.8707112389802932, + "num_tokens": 2582428.0, + "step": 1050 + }, + { + "entropy": 0.47586817651987073, + "epoch": 2.9424364123159306, + "grad_norm": 0.25799325108528137, + "learning_rate": 0.00018641305742603172, + "loss": 0.44020862579345704, + "mean_token_accuracy": 0.869452143907547, + "num_tokens": 2707532.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.5092438699305057, + "eval_loss": 0.5377861857414246, + "eval_mean_token_accuracy": 0.8451382353901863, + "eval_num_tokens": 2761311.0, + "eval_runtime": 49.0653, + "eval_samples_per_second": 32.589, + "eval_steps_per_second": 4.076, + "step": 1122 + }, + { + "entropy": 0.44004572521556506, + "epoch": 3.074966532797858, + "grad_norm": 0.31287693977355957, + "learning_rate": 0.0001832643222301409, + "loss": 0.39778636932373046, + "mean_token_accuracy": 0.8794932232962714, + "num_tokens": 2834344.0, + "step": 1150 + }, + { + "entropy": 0.42112498462200165, + "epoch": 3.208835341365462, + "grad_norm": 0.3411082327365875, + "learning_rate": 0.000179944623687242, + "loss": 0.3773982620239258, + "mean_token_accuracy": 0.8827895969152451, + "num_tokens": 2952748.0, + "step": 1200 + }, + { + "entropy": 0.41285495966672897, + "epoch": 3.3427041499330654, + "grad_norm": 0.31819090247154236, + "learning_rate": 0.0001764611900173143, + "loss": 0.3741728210449219, + "mean_token_accuracy": 0.8846501314640045, + "num_tokens": 3080333.0, + "step": 1250 + }, + { + "entropy": 0.42110098838806154, + "epoch": 3.4765729585006695, + "grad_norm": 0.3009640872478485, + "learning_rate": 0.00017282160595268327, + "loss": 0.3816569900512695, + "mean_token_accuracy": 0.8814844501018524, + "num_tokens": 3206920.0, + "step": 1300 + }, + { + "entropy": 0.4269171151518822, + "epoch": 3.610441767068273, + "grad_norm": 0.3651696741580963, + "learning_rate": 0.00016903379622323396, + "loss": 0.38763641357421874, + "mean_token_accuracy": 0.8813142603635788, + "num_tokens": 3327931.0, + "step": 1350 + }, + { + "entropy": 0.4262796178460121, + "epoch": 3.7443105756358768, + "grad_norm": 0.28190597891807556, + "learning_rate": 0.00016510600830132272, + "loss": 0.38563640594482423, + "mean_token_accuracy": 0.8813980734348297, + "num_tokens": 3443874.0, + "step": 1400 + }, + { + "entropy": 0.4200029063224793, + "epoch": 3.878179384203481, + "grad_norm": 0.2584955096244812, + "learning_rate": 0.00016104679444395854, + "loss": 0.3829658508300781, + "mean_token_accuracy": 0.8822885012626648, + "num_tokens": 3573158.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.47917599841952324, + "eval_loss": 0.5421923398971558, + "eval_mean_token_accuracy": 0.8468858799338341, + "eval_num_tokens": 3681748.0, + "eval_runtime": 49.0466, + "eval_samples_per_second": 32.602, + "eval_steps_per_second": 4.078, + "step": 1496 + }, + { + "entropy": 0.42061784803265273, + "epoch": 4.010709504685408, + "grad_norm": 0.3225401043891907, + "learning_rate": 0.0001568649930713548, + "loss": 0.38076282501220704, + "mean_token_accuracy": 0.8825460594109814, + "num_tokens": 3691654.0, + "step": 1500 + }, + { + "entropy": 0.3564541311562061, + "epoch": 4.144578313253012, + "grad_norm": 0.42701128125190735, + "learning_rate": 0.00015256970952239702, + "loss": 0.3080678176879883, + "mean_token_accuracy": 0.9021131205558777, + "num_tokens": 3813863.0, + "step": 1550 + }, + { + "entropy": 0.3623583456873894, + "epoch": 4.278447121820616, + "grad_norm": 0.7542179822921753, + "learning_rate": 0.00014817029622892904, + "loss": 0.31919103622436523, + "mean_token_accuracy": 0.8978805804252624, + "num_tokens": 3942009.0, + "step": 1600 + }, + { + "entropy": 0.35793897867202756, + "epoch": 4.412315930388219, + "grad_norm": 0.3683936595916748, + "learning_rate": 0.0001436763323520266, + "loss": 0.31606245040893555, + "mean_token_accuracy": 0.8989632934331894, + "num_tokens": 4067146.0, + "step": 1650 + }, + { + "entropy": 0.36605307310819624, + "epoch": 4.546184738955823, + "grad_norm": 0.3938419222831726, + "learning_rate": 0.00013909760292459586, + "loss": 0.3214926528930664, + "mean_token_accuracy": 0.897950147986412, + "num_tokens": 4186586.0, + "step": 1700 + }, + { + "entropy": 0.3702411252260208, + "epoch": 4.680053547523427, + "grad_norm": 0.3159140646457672, + "learning_rate": 0.0001344440775457131, + "loss": 0.32606857299804687, + "mean_token_accuracy": 0.8971680045127869, + "num_tokens": 4307001.0, + "step": 1750 + }, + { + "entropy": 0.3695961621403694, + "epoch": 4.813922356091031, + "grad_norm": 0.43663182854652405, + "learning_rate": 0.00012972588867309488, + "loss": 0.324642448425293, + "mean_token_accuracy": 0.8974496972560883, + "num_tokens": 4429513.0, + "step": 1800 + }, + { + "entropy": 0.3658722630143166, + "epoch": 4.947791164658635, + "grad_norm": 0.3189752995967865, + "learning_rate": 0.0001249533095609642, + "loss": 0.3198036575317383, + "mean_token_accuracy": 0.8985732847452164, + "num_tokens": 4557562.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.43428498685359956, + "eval_loss": 0.5698739290237427, + "eval_mean_token_accuracy": 0.8450208070874214, + "eval_num_tokens": 4602185.0, + "eval_runtime": 49.0399, + "eval_samples_per_second": 32.606, + "eval_steps_per_second": 4.078, + "step": 1870 + }, + { + "entropy": 0.3230800422454121, + "epoch": 5.080321285140562, + "grad_norm": 0.38153186440467834, + "learning_rate": 0.00012013673189135029, + "loss": 0.2727243995666504, + "mean_token_accuracy": 0.9120446022110756, + "num_tokens": 4679483.0, + "step": 1900 + }, + { + "entropy": 0.2979305517673492, + "epoch": 5.214190093708166, + "grad_norm": 0.42468664050102234, + "learning_rate": 0.00011528664314752708, + "loss": 0.24437490463256836, + "mean_token_accuracy": 0.9198145979642868, + "num_tokens": 4800131.0, + "step": 1950 + }, + { + "entropy": 0.29811844661831854, + "epoch": 5.34805890227577, + "grad_norm": 0.48722052574157715, + "learning_rate": 0.0001104136037788565, + "loss": 0.2472528076171875, + "mean_token_accuracy": 0.9198214167356491, + "num_tokens": 4923382.0, + "step": 2000 + }, + { + "entropy": 0.3036586672067642, + "epoch": 5.481927710843373, + "grad_norm": 0.4003150165081024, + "learning_rate": 0.00010552822420675757, + "loss": 0.2524623489379883, + "mean_token_accuracy": 0.9182902538776397, + "num_tokens": 5042200.0, + "step": 2050 + }, + { + "entropy": 0.30276541873812673, + "epoch": 5.615796519410977, + "grad_norm": 0.5082385540008545, + "learning_rate": 0.00010064114172186765, + "loss": 0.2554252052307129, + "mean_token_accuracy": 0.9163929194211959, + "num_tokens": 5168285.0, + "step": 2100 + }, + { + "entropy": 0.30480867981910703, + "epoch": 5.749665327978581, + "grad_norm": 0.46462953090667725, + "learning_rate": 9.57629973226994e-05, + "loss": 0.25483154296875, + "mean_token_accuracy": 0.916156811118126, + "num_tokens": 5289880.0, + "step": 2150 + }, + { + "entropy": 0.2954915864765644, + "epoch": 5.883534136546185, + "grad_norm": 0.5345046520233154, + "learning_rate": 9.090441254622432e-05, + "loss": 0.24575115203857423, + "mean_token_accuracy": 0.9198049437999726, + "num_tokens": 5413325.0, + "step": 2200 + }, + { + "epoch": 6.0, + "eval_entropy": 0.37039283126592637, + "eval_loss": 0.6287115812301636, + "eval_mean_token_accuracy": 0.8435265091061592, + "eval_num_tokens": 5522622.0, + "eval_runtime": 49.0873, + "eval_samples_per_second": 32.575, + "eval_steps_per_second": 4.074, + "step": 2244 + }, + { + "entropy": 0.2953245359839815, + "epoch": 6.016064257028113, + "grad_norm": 0.4055909216403961, + "learning_rate": 8.607596634083136e-05, + "loss": 0.24116868972778321, + "mean_token_accuracy": 0.9220171468426482, + "num_tokens": 5536511.0, + "step": 2250 + }, + { + "entropy": 0.229744790494442, + "epoch": 6.149933065595716, + "grad_norm": 0.4221028983592987, + "learning_rate": 8.128817203201665e-05, + "loss": 0.1732115364074707, + "mean_token_accuracy": 0.9427249735593796, + "num_tokens": 5663054.0, + "step": 2300 + }, + { + "entropy": 0.23100735485553742, + "epoch": 6.28380187416332, + "grad_norm": 0.5276848673820496, + "learning_rate": 7.655145443095877e-05, + "loss": 0.1742458724975586, + "mean_token_accuracy": 0.9424393928050995, + "num_tokens": 5790379.0, + "step": 2350 + }, + { + "entropy": 0.2334547135233879, + "epoch": 6.417670682730924, + "grad_norm": 0.5443829298019409, + "learning_rate": 7.187612713582257e-05, + "loss": 0.17723684310913085, + "mean_token_accuracy": 0.9421556174755097, + "num_tokens": 5908852.0, + "step": 2400 + }, + { + "entropy": 0.22672353580594062, + "epoch": 6.551539491298527, + "grad_norm": 0.4869251251220703, + "learning_rate": 6.727237007521524e-05, + "loss": 0.17469547271728517, + "mean_token_accuracy": 0.9419155931472778, + "num_tokens": 6033966.0, + "step": 2450 + }, + { + "entropy": 0.23190354615449904, + "epoch": 6.685408299866131, + "grad_norm": 0.5983045697212219, + "learning_rate": 6.275020734269083e-05, + "loss": 0.17733327865600587, + "mean_token_accuracy": 0.9419048410654068, + "num_tokens": 6156278.0, + "step": 2500 + }, + { + "entropy": 0.23502119958400727, + "epoch": 6.8192771084337345, + "grad_norm": 0.5186291337013245, + "learning_rate": 5.831948537056545e-05, + "loss": 0.18074512481689453, + "mean_token_accuracy": 0.9402925485372543, + "num_tokens": 6276774.0, + "step": 2550 + }, + { + "entropy": 0.23055340006947517, + "epoch": 6.953145917001339, + "grad_norm": 0.48175379633903503, + "learning_rate": 5.3989851490567374e-05, + "loss": 0.17573400497436523, + "mean_token_accuracy": 0.9420478469133378, + "num_tokens": 6401444.0, + "step": 2600 + }, + { + "epoch": 7.0, + "eval_entropy": 0.3201144814491272, + "eval_loss": 0.7185283899307251, + "eval_mean_token_accuracy": 0.8399944826960564, + "eval_num_tokens": 6443059.0, + "eval_runtime": 49.0868, + "eval_samples_per_second": 32.575, + "eval_steps_per_second": 4.074, + "step": 2618 + }, + { + "entropy": 0.19884085278920452, + "epoch": 7.085676037483267, + "grad_norm": 0.4999229311943054, + "learning_rate": 4.977073292800337e-05, + "loss": 0.13776198387145996, + "mean_token_accuracy": 0.95504442369095, + "num_tokens": 6525142.0, + "step": 2650 + }, + { + "entropy": 0.17382358580827714, + "epoch": 7.21954484605087, + "grad_norm": 0.44681516289711, + "learning_rate": 4.567131627517827e-05, + "loss": 0.1151345157623291, + "mean_token_accuracy": 0.9624496775865555, + "num_tokens": 6651930.0, + "step": 2700 + }, + { + "entropy": 0.17993666499853134, + "epoch": 7.353413654618474, + "grad_norm": 0.47544270753860474, + "learning_rate": 4.1700527488762594e-05, + "loss": 0.12008686065673828, + "mean_token_accuracy": 0.9607802790403366, + "num_tokens": 6768469.0, + "step": 2750 + }, + { + "entropy": 0.17229609042406083, + "epoch": 7.4872824631860775, + "grad_norm": 0.4898432791233063, + "learning_rate": 3.786701245466089e-05, + "loss": 0.1164663314819336, + "mean_token_accuracy": 0.9622354304790497, + "num_tokens": 6892532.0, + "step": 2800 + }, + { + "entropy": 0.16839693702757358, + "epoch": 7.621151271753681, + "grad_norm": 0.5927155613899231, + "learning_rate": 3.417911816269838e-05, + "loss": 0.1138334846496582, + "mean_token_accuracy": 0.9632772338390351, + "num_tokens": 7023373.0, + "step": 2850 + }, + { + "entropy": 0.173521406725049, + "epoch": 7.755020080321285, + "grad_norm": 0.5407077670097351, + "learning_rate": 3.0644874532115575e-05, + "loss": 0.11670659065246582, + "mean_token_accuracy": 0.9622769457101822, + "num_tokens": 7146448.0, + "step": 2900 + }, + { + "entropy": 0.17213394075632096, + "epoch": 7.888888888888889, + "grad_norm": 0.48647794127464294, + "learning_rate": 2.727197692744389e-05, + "loss": 0.11715221405029297, + "mean_token_accuracy": 0.9625237709283829, + "num_tokens": 7267680.0, + "step": 2950 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2757616487890482, + "eval_loss": 0.8189995884895325, + "eval_mean_token_accuracy": 0.8390460336208343, + "eval_num_tokens": 7363496.0, + "eval_runtime": 49.0999, + "eval_samples_per_second": 32.566, + "eval_steps_per_second": 4.073, + "step": 2992 + }, + { + "entropy": 0.17219420319253748, + "epoch": 8.021419009370817, + "grad_norm": 0.37176114320755005, + "learning_rate": 2.406776940283137e-05, + "loss": 0.11532307624816894, + "mean_token_accuracy": 0.9632835845754604, + "num_tokens": 7384150.0, + "step": 3000 + }, + { + "entropy": 0.14740404956042766, + "epoch": 8.15528781793842, + "grad_norm": 0.3686515688896179, + "learning_rate": 2.10392287113017e-05, + "loss": 0.08609914779663086, + "mean_token_accuracy": 0.9740558165311813, + "num_tokens": 7502454.0, + "step": 3050 + }, + { + "entropy": 0.14145817942917346, + "epoch": 8.289156626506024, + "grad_norm": 0.3443886339664459, + "learning_rate": 1.8192949113764877e-05, + "loss": 0.081221923828125, + "mean_token_accuracy": 0.9744218772649765, + "num_tokens": 7628419.0, + "step": 3100 + }, + { + "entropy": 0.13880868263542653, + "epoch": 8.423025435073628, + "grad_norm": 0.4160684049129486, + "learning_rate": 1.5535128020855533e-05, + "loss": 0.0840027904510498, + "mean_token_accuracy": 0.9742409408092498, + "num_tokens": 7751912.0, + "step": 3150 + }, + { + "entropy": 0.13735023334622384, + "epoch": 8.556894243641231, + "grad_norm": 0.5086039900779724, + "learning_rate": 1.3071552498861985e-05, + "loss": 0.08229084014892578, + "mean_token_accuracy": 0.9739874929189682, + "num_tokens": 7877804.0, + "step": 3200 + }, + { + "entropy": 0.1375646834075451, + "epoch": 8.690763052208835, + "grad_norm": 0.32038062810897827, + "learning_rate": 1.0807586669127857e-05, + "loss": 0.08256589889526367, + "mean_token_accuracy": 0.9740087121725083, + "num_tokens": 8003296.0, + "step": 3250 + }, + { + "entropy": 0.14328225292265415, + "epoch": 8.824631860776439, + "grad_norm": 0.34184616804122925, + "learning_rate": 8.748160028362413e-06, + "loss": 0.08445584297180175, + "mean_token_accuracy": 0.9736961781978607, + "num_tokens": 8123859.0, + "step": 3300 + }, + { + "entropy": 0.14006814867258072, + "epoch": 8.958500669344042, + "grad_norm": 0.610028088092804, + "learning_rate": 6.897756715290319e-06, + "loss": 0.08359557151794433, + "mean_token_accuracy": 0.9739799553155899, + "num_tokens": 8246971.0, + "step": 3350 + }, + { + "epoch": 9.0, + "eval_entropy": 0.2503813248872757, + "eval_loss": 0.9325668215751648, + "eval_mean_token_accuracy": 0.8365286010503769, + "eval_num_tokens": 8283933.0, + "eval_runtime": 49.0602, + "eval_samples_per_second": 32.593, + "eval_steps_per_second": 4.077, + "step": 3366 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.344062788520479e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..62108c4c8ad6a925598383a7b5a5345aa9b946e6 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.0652985372477836, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj", + "up_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5366ce377642689b236590ff16365897360188f2 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/trainer_state.json @@ -0,0 +1,115 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 374, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.8254910862445832, + "epoch": 0.13386880856760375, + "grad_norm": 0.8958511352539062, + "learning_rate": 2.7446846603309888e-05, + "loss": 1.722928009033203, + "mean_token_accuracy": 0.6557231456041336, + "num_tokens": 122643.0, + "step": 50 + }, + { + "entropy": 0.8272530055046081, + "epoch": 0.2677376171352075, + "grad_norm": 0.6162250638008118, + "learning_rate": 5.5453832933217935e-05, + "loss": 0.7761137390136719, + "mean_token_accuracy": 0.795179500579834, + "num_tokens": 245702.0, + "step": 100 + }, + { + "entropy": 0.6629777508974075, + "epoch": 0.40160642570281124, + "grad_norm": 0.43394413590431213, + "learning_rate": 8.346081926312598e-05, + "loss": 0.6302793884277343, + "mean_token_accuracy": 0.8262274640798569, + "num_tokens": 371834.0, + "step": 150 + }, + { + "entropy": 0.6184763962030411, + "epoch": 0.535475234270415, + "grad_norm": 0.4373406767845154, + "learning_rate": 0.00011146780559303404, + "loss": 0.5863075256347656, + "mean_token_accuracy": 0.8357332807779312, + "num_tokens": 500948.0, + "step": 200 + }, + { + "entropy": 0.6116772794723511, + "epoch": 0.6693440428380187, + "grad_norm": 0.6781982183456421, + "learning_rate": 0.00013947479192294207, + "loss": 0.5783074951171875, + "mean_token_accuracy": 0.8368694090843201, + "num_tokens": 621874.0, + "step": 250 + }, + { + "entropy": 0.5848374783992767, + "epoch": 0.8032128514056225, + "grad_norm": 0.3493351340293884, + "learning_rate": 0.00016748177825285014, + "loss": 0.5531037902832031, + "mean_token_accuracy": 0.8434528934955597, + "num_tokens": 746138.0, + "step": 300 + }, + { + "entropy": 0.57043960750103, + "epoch": 0.9370816599732262, + "grad_norm": 0.3953551650047302, + "learning_rate": 0.00019548876458275817, + "loss": 0.5387083053588867, + "mean_token_accuracy": 0.844996885061264, + "num_tokens": 868331.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.6065685012936592, + "eval_loss": 0.5825985074043274, + "eval_mean_token_accuracy": 0.8326057174801826, + "eval_num_tokens": 920437.0, + "eval_runtime": 49.017, + "eval_samples_per_second": 32.621, + "eval_steps_per_second": 4.08, + "step": 374 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.800445791636787e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..62108c4c8ad6a925598383a7b5a5345aa9b946e6 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.0652985372477836, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj", + "up_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e0b077016f64f0c0638281a1f528441069be96a3 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/trainer_state.json @@ -0,0 +1,884 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "eval_steps": 500, + "global_step": 3740, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.8254910862445832, + "epoch": 0.13386880856760375, + "grad_norm": 0.8958511352539062, + "learning_rate": 2.7446846603309888e-05, + "loss": 1.722928009033203, + "mean_token_accuracy": 0.6557231456041336, + "num_tokens": 122643.0, + "step": 50 + }, + { + "entropy": 0.8272530055046081, + "epoch": 0.2677376171352075, + "grad_norm": 0.6162250638008118, + "learning_rate": 5.5453832933217935e-05, + "loss": 0.7761137390136719, + "mean_token_accuracy": 0.795179500579834, + "num_tokens": 245702.0, + "step": 100 + }, + { + "entropy": 0.6629777508974075, + "epoch": 0.40160642570281124, + "grad_norm": 0.43394413590431213, + "learning_rate": 8.346081926312598e-05, + "loss": 0.6302793884277343, + "mean_token_accuracy": 0.8262274640798569, + "num_tokens": 371834.0, + "step": 150 + }, + { + "entropy": 0.6184763962030411, + "epoch": 0.535475234270415, + "grad_norm": 0.4373406767845154, + "learning_rate": 0.00011146780559303404, + "loss": 0.5863075256347656, + "mean_token_accuracy": 0.8357332807779312, + "num_tokens": 500948.0, + "step": 200 + }, + { + "entropy": 0.6116772794723511, + "epoch": 0.6693440428380187, + "grad_norm": 0.6781982183456421, + "learning_rate": 0.00013947479192294207, + "loss": 0.5783074951171875, + "mean_token_accuracy": 0.8368694090843201, + "num_tokens": 621874.0, + "step": 250 + }, + { + "entropy": 0.5848374783992767, + "epoch": 0.8032128514056225, + "grad_norm": 0.3493351340293884, + "learning_rate": 0.00016748177825285014, + "loss": 0.5531037902832031, + "mean_token_accuracy": 0.8434528934955597, + "num_tokens": 746138.0, + "step": 300 + }, + { + "entropy": 0.57043960750103, + "epoch": 0.9370816599732262, + "grad_norm": 0.3953551650047302, + "learning_rate": 0.00019548876458275817, + "loss": 0.5387083053588867, + "mean_token_accuracy": 0.844996885061264, + "num_tokens": 868331.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.6065685012936592, + "eval_loss": 0.5825985074043274, + "eval_mean_token_accuracy": 0.8326057174801826, + "eval_num_tokens": 920437.0, + "eval_runtime": 49.017, + "eval_samples_per_second": 32.621, + "eval_steps_per_second": 4.08, + "step": 374 + }, + { + "entropy": 0.5623676343397661, + "epoch": 1.069611780455154, + "grad_norm": 0.37431156635284424, + "learning_rate": 0.00020946374495076317, + "loss": 0.526231803894043, + "mean_token_accuracy": 0.8500469212580208, + "num_tokens": 980838.0, + "step": 400 + }, + { + "entropy": 0.5420066699385643, + "epoch": 1.2034805890227578, + "grad_norm": 0.3302382826805115, + "learning_rate": 0.00020923573570386192, + "loss": 0.5088261032104492, + "mean_token_accuracy": 0.8518517130613327, + "num_tokens": 1104763.0, + "step": 450 + }, + { + "entropy": 0.5347033357620239, + "epoch": 1.3373493975903614, + "grad_norm": 0.281392902135849, + "learning_rate": 0.00020878021367110025, + "loss": 0.5044374084472656, + "mean_token_accuracy": 0.8548643559217453, + "num_tokens": 1230459.0, + "step": 500 + }, + { + "entropy": 0.5319472518563271, + "epoch": 1.4712182061579653, + "grad_norm": 0.3298404812812805, + "learning_rate": 0.00020809817069357935, + "loss": 0.5011252593994141, + "mean_token_accuracy": 0.8546603417396545, + "num_tokens": 1356368.0, + "step": 550 + }, + { + "entropy": 0.5234624195098877, + "epoch": 1.605087014725569, + "grad_norm": 0.26496022939682007, + "learning_rate": 0.00020719109183285305, + "loss": 0.49288436889648435, + "mean_token_accuracy": 0.8559961414337158, + "num_tokens": 1484569.0, + "step": 600 + }, + { + "entropy": 0.5170091751217842, + "epoch": 1.7389558232931726, + "grad_norm": 0.2679271996021271, + "learning_rate": 0.00020606095213739626, + "loss": 0.4867116165161133, + "mean_token_accuracy": 0.8584025889635086, + "num_tokens": 1609962.0, + "step": 650 + }, + { + "entropy": 0.519580851495266, + "epoch": 1.8728246318607764, + "grad_norm": 0.2783832848072052, + "learning_rate": 0.0002047102123421885, + "loss": 0.4858899688720703, + "mean_token_accuracy": 0.8601382756233216, + "num_tokens": 1729667.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.5516054129600525, + "eval_loss": 0.5468233227729797, + "eval_mean_token_accuracy": 0.8418151989579201, + "eval_num_tokens": 1840874.0, + "eval_runtime": 49.0401, + "eval_samples_per_second": 32.606, + "eval_steps_per_second": 4.078, + "step": 748 + }, + { + "entropy": 0.5179645954960524, + "epoch": 2.005354752342704, + "grad_norm": 0.39848020672798157, + "learning_rate": 0.00020314181351077757, + "loss": 0.4828613662719727, + "mean_token_accuracy": 0.8594950991447525, + "num_tokens": 1846410.0, + "step": 750 + }, + { + "entropy": 0.4724735128879547, + "epoch": 2.139223560910308, + "grad_norm": 0.27450788021087646, + "learning_rate": 0.00020135917063148916, + "loss": 0.43275066375732424, + "mean_token_accuracy": 0.8704854369163513, + "num_tokens": 1971547.0, + "step": 800 + }, + { + "entropy": 0.4807534040510654, + "epoch": 2.2730923694779115, + "grad_norm": 0.3146534562110901, + "learning_rate": 0.00019936616518172531, + "loss": 0.44326435089111327, + "mean_token_accuracy": 0.8670724099874496, + "num_tokens": 2088560.0, + "step": 850 + }, + { + "entropy": 0.4744683504104614, + "epoch": 2.4069611780455156, + "grad_norm": 0.3151724338531494, + "learning_rate": 0.0001971671366765428, + "loss": 0.44036914825439455, + "mean_token_accuracy": 0.8687405550479889, + "num_tokens": 2210146.0, + "step": 900 + }, + { + "entropy": 0.4728820985555649, + "epoch": 2.540829986613119, + "grad_norm": 0.30318814516067505, + "learning_rate": 0.00019476687321991266, + "loss": 0.43707496643066407, + "mean_token_accuracy": 0.8694235664606095, + "num_tokens": 2331354.0, + "step": 950 + }, + { + "entropy": 0.4706392896175384, + "epoch": 2.674698795180723, + "grad_norm": 0.32992425560951233, + "learning_rate": 0.00019217060107923494, + "loss": 0.4379159545898437, + "mean_token_accuracy": 0.8695945852994919, + "num_tokens": 2452558.0, + "step": 1000 + }, + { + "entropy": 0.46944593608379365, + "epoch": 2.8085676037483265, + "grad_norm": 0.23659372329711914, + "learning_rate": 0.0001893839733058082, + "loss": 0.43395462036132815, + "mean_token_accuracy": 0.8707112389802932, + "num_tokens": 2582428.0, + "step": 1050 + }, + { + "entropy": 0.47586817651987073, + "epoch": 2.9424364123159306, + "grad_norm": 0.25799325108528137, + "learning_rate": 0.00018641305742603172, + "loss": 0.44020862579345704, + "mean_token_accuracy": 0.869452143907547, + "num_tokens": 2707532.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.5092438699305057, + "eval_loss": 0.5377861857414246, + "eval_mean_token_accuracy": 0.8451382353901863, + "eval_num_tokens": 2761311.0, + "eval_runtime": 49.0653, + "eval_samples_per_second": 32.589, + "eval_steps_per_second": 4.076, + "step": 1122 + }, + { + "entropy": 0.44004572521556506, + "epoch": 3.074966532797858, + "grad_norm": 0.31287693977355957, + "learning_rate": 0.0001832643222301409, + "loss": 0.39778636932373046, + "mean_token_accuracy": 0.8794932232962714, + "num_tokens": 2834344.0, + "step": 1150 + }, + { + "entropy": 0.42112498462200165, + "epoch": 3.208835341365462, + "grad_norm": 0.3411082327365875, + "learning_rate": 0.000179944623687242, + "loss": 0.3773982620239258, + "mean_token_accuracy": 0.8827895969152451, + "num_tokens": 2952748.0, + "step": 1200 + }, + { + "entropy": 0.41285495966672897, + "epoch": 3.3427041499330654, + "grad_norm": 0.31819090247154236, + "learning_rate": 0.0001764611900173143, + "loss": 0.3741728210449219, + "mean_token_accuracy": 0.8846501314640045, + "num_tokens": 3080333.0, + "step": 1250 + }, + { + "entropy": 0.42110098838806154, + "epoch": 3.4765729585006695, + "grad_norm": 0.3009640872478485, + "learning_rate": 0.00017282160595268327, + "loss": 0.3816569900512695, + "mean_token_accuracy": 0.8814844501018524, + "num_tokens": 3206920.0, + "step": 1300 + }, + { + "entropy": 0.4269171151518822, + "epoch": 3.610441767068273, + "grad_norm": 0.3651696741580963, + "learning_rate": 0.00016903379622323396, + "loss": 0.38763641357421874, + "mean_token_accuracy": 0.8813142603635788, + "num_tokens": 3327931.0, + "step": 1350 + }, + { + "entropy": 0.4262796178460121, + "epoch": 3.7443105756358768, + "grad_norm": 0.28190597891807556, + "learning_rate": 0.00016510600830132272, + "loss": 0.38563640594482423, + "mean_token_accuracy": 0.8813980734348297, + "num_tokens": 3443874.0, + "step": 1400 + }, + { + "entropy": 0.4200029063224793, + "epoch": 3.878179384203481, + "grad_norm": 0.2584955096244812, + "learning_rate": 0.00016104679444395854, + "loss": 0.3829658508300781, + "mean_token_accuracy": 0.8822885012626648, + "num_tokens": 3573158.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.47917599841952324, + "eval_loss": 0.5421923398971558, + "eval_mean_token_accuracy": 0.8468858799338341, + "eval_num_tokens": 3681748.0, + "eval_runtime": 49.0466, + "eval_samples_per_second": 32.602, + "eval_steps_per_second": 4.078, + "step": 1496 + }, + { + "entropy": 0.42061784803265273, + "epoch": 4.010709504685408, + "grad_norm": 0.3225401043891907, + "learning_rate": 0.0001568649930713548, + "loss": 0.38076282501220704, + "mean_token_accuracy": 0.8825460594109814, + "num_tokens": 3691654.0, + "step": 1500 + }, + { + "entropy": 0.3564541311562061, + "epoch": 4.144578313253012, + "grad_norm": 0.42701128125190735, + "learning_rate": 0.00015256970952239702, + "loss": 0.3080678176879883, + "mean_token_accuracy": 0.9021131205558777, + "num_tokens": 3813863.0, + "step": 1550 + }, + { + "entropy": 0.3623583456873894, + "epoch": 4.278447121820616, + "grad_norm": 0.7542179822921753, + "learning_rate": 0.00014817029622892904, + "loss": 0.31919103622436523, + "mean_token_accuracy": 0.8978805804252624, + "num_tokens": 3942009.0, + "step": 1600 + }, + { + "entropy": 0.35793897867202756, + "epoch": 4.412315930388219, + "grad_norm": 0.3683936595916748, + "learning_rate": 0.0001436763323520266, + "loss": 0.31606245040893555, + "mean_token_accuracy": 0.8989632934331894, + "num_tokens": 4067146.0, + "step": 1650 + }, + { + "entropy": 0.36605307310819624, + "epoch": 4.546184738955823, + "grad_norm": 0.3938419222831726, + "learning_rate": 0.00013909760292459586, + "loss": 0.3214926528930664, + "mean_token_accuracy": 0.897950147986412, + "num_tokens": 4186586.0, + "step": 1700 + }, + { + "entropy": 0.3702411252260208, + "epoch": 4.680053547523427, + "grad_norm": 0.3159140646457672, + "learning_rate": 0.0001344440775457131, + "loss": 0.32606857299804687, + "mean_token_accuracy": 0.8971680045127869, + "num_tokens": 4307001.0, + "step": 1750 + }, + { + "entropy": 0.3695961621403694, + "epoch": 4.813922356091031, + "grad_norm": 0.43663182854652405, + "learning_rate": 0.00012972588867309488, + "loss": 0.324642448425293, + "mean_token_accuracy": 0.8974496972560883, + "num_tokens": 4429513.0, + "step": 1800 + }, + { + "entropy": 0.3658722630143166, + "epoch": 4.947791164658635, + "grad_norm": 0.3189752995967865, + "learning_rate": 0.0001249533095609642, + "loss": 0.3198036575317383, + "mean_token_accuracy": 0.8985732847452164, + "num_tokens": 4557562.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.43428498685359956, + "eval_loss": 0.5698739290237427, + "eval_mean_token_accuracy": 0.8450208070874214, + "eval_num_tokens": 4602185.0, + "eval_runtime": 49.0399, + "eval_samples_per_second": 32.606, + "eval_steps_per_second": 4.078, + "step": 1870 + }, + { + "entropy": 0.3230800422454121, + "epoch": 5.080321285140562, + "grad_norm": 0.38153186440467834, + "learning_rate": 0.00012013673189135029, + "loss": 0.2727243995666504, + "mean_token_accuracy": 0.9120446022110756, + "num_tokens": 4679483.0, + "step": 1900 + }, + { + "entropy": 0.2979305517673492, + "epoch": 5.214190093708166, + "grad_norm": 0.42468664050102234, + "learning_rate": 0.00011528664314752708, + "loss": 0.24437490463256836, + "mean_token_accuracy": 0.9198145979642868, + "num_tokens": 4800131.0, + "step": 1950 + }, + { + "entropy": 0.29811844661831854, + "epoch": 5.34805890227577, + "grad_norm": 0.48722052574157715, + "learning_rate": 0.0001104136037788565, + "loss": 0.2472528076171875, + "mean_token_accuracy": 0.9198214167356491, + "num_tokens": 4923382.0, + "step": 2000 + }, + { + "entropy": 0.3036586672067642, + "epoch": 5.481927710843373, + "grad_norm": 0.4003150165081024, + "learning_rate": 0.00010552822420675757, + "loss": 0.2524623489379883, + "mean_token_accuracy": 0.9182902538776397, + "num_tokens": 5042200.0, + "step": 2050 + }, + { + "entropy": 0.30276541873812673, + "epoch": 5.615796519410977, + "grad_norm": 0.5082385540008545, + "learning_rate": 0.00010064114172186765, + "loss": 0.2554252052307129, + "mean_token_accuracy": 0.9163929194211959, + "num_tokens": 5168285.0, + "step": 2100 + }, + { + "entropy": 0.30480867981910703, + "epoch": 5.749665327978581, + "grad_norm": 0.46462953090667725, + "learning_rate": 9.57629973226994e-05, + "loss": 0.25483154296875, + "mean_token_accuracy": 0.916156811118126, + "num_tokens": 5289880.0, + "step": 2150 + }, + { + "entropy": 0.2954915864765644, + "epoch": 5.883534136546185, + "grad_norm": 0.5345046520233154, + "learning_rate": 9.090441254622432e-05, + "loss": 0.24575115203857423, + "mean_token_accuracy": 0.9198049437999726, + "num_tokens": 5413325.0, + "step": 2200 + }, + { + "epoch": 6.0, + "eval_entropy": 0.37039283126592637, + "eval_loss": 0.6287115812301636, + "eval_mean_token_accuracy": 0.8435265091061592, + "eval_num_tokens": 5522622.0, + "eval_runtime": 49.0873, + "eval_samples_per_second": 32.575, + "eval_steps_per_second": 4.074, + "step": 2244 + }, + { + "entropy": 0.2953245359839815, + "epoch": 6.016064257028113, + "grad_norm": 0.4055909216403961, + "learning_rate": 8.607596634083136e-05, + "loss": 0.24116868972778321, + "mean_token_accuracy": 0.9220171468426482, + "num_tokens": 5536511.0, + "step": 2250 + }, + { + "entropy": 0.229744790494442, + "epoch": 6.149933065595716, + "grad_norm": 0.4221028983592987, + "learning_rate": 8.128817203201665e-05, + "loss": 0.1732115364074707, + "mean_token_accuracy": 0.9427249735593796, + "num_tokens": 5663054.0, + "step": 2300 + }, + { + "entropy": 0.23100735485553742, + "epoch": 6.28380187416332, + "grad_norm": 0.5276848673820496, + "learning_rate": 7.655145443095877e-05, + "loss": 0.1742458724975586, + "mean_token_accuracy": 0.9424393928050995, + "num_tokens": 5790379.0, + "step": 2350 + }, + { + "entropy": 0.2334547135233879, + "epoch": 6.417670682730924, + "grad_norm": 0.5443829298019409, + "learning_rate": 7.187612713582257e-05, + "loss": 0.17723684310913085, + "mean_token_accuracy": 0.9421556174755097, + "num_tokens": 5908852.0, + "step": 2400 + }, + { + "entropy": 0.22672353580594062, + "epoch": 6.551539491298527, + "grad_norm": 0.4869251251220703, + "learning_rate": 6.727237007521524e-05, + "loss": 0.17469547271728517, + "mean_token_accuracy": 0.9419155931472778, + "num_tokens": 6033966.0, + "step": 2450 + }, + { + "entropy": 0.23190354615449904, + "epoch": 6.685408299866131, + "grad_norm": 0.5983045697212219, + "learning_rate": 6.275020734269083e-05, + "loss": 0.17733327865600587, + "mean_token_accuracy": 0.9419048410654068, + "num_tokens": 6156278.0, + "step": 2500 + }, + { + "entropy": 0.23502119958400727, + "epoch": 6.8192771084337345, + "grad_norm": 0.5186291337013245, + "learning_rate": 5.831948537056545e-05, + "loss": 0.18074512481689453, + "mean_token_accuracy": 0.9402925485372543, + "num_tokens": 6276774.0, + "step": 2550 + }, + { + "entropy": 0.23055340006947517, + "epoch": 6.953145917001339, + "grad_norm": 0.48175379633903503, + "learning_rate": 5.3989851490567374e-05, + "loss": 0.17573400497436523, + "mean_token_accuracy": 0.9420478469133378, + "num_tokens": 6401444.0, + "step": 2600 + }, + { + "epoch": 7.0, + "eval_entropy": 0.3201144814491272, + "eval_loss": 0.7185283899307251, + "eval_mean_token_accuracy": 0.8399944826960564, + "eval_num_tokens": 6443059.0, + "eval_runtime": 49.0868, + "eval_samples_per_second": 32.575, + "eval_steps_per_second": 4.074, + "step": 2618 + }, + { + "entropy": 0.19884085278920452, + "epoch": 7.085676037483267, + "grad_norm": 0.4999229311943054, + "learning_rate": 4.977073292800337e-05, + "loss": 0.13776198387145996, + "mean_token_accuracy": 0.95504442369095, + "num_tokens": 6525142.0, + "step": 2650 + }, + { + "entropy": 0.17382358580827714, + "epoch": 7.21954484605087, + "grad_norm": 0.44681516289711, + "learning_rate": 4.567131627517827e-05, + "loss": 0.1151345157623291, + "mean_token_accuracy": 0.9624496775865555, + "num_tokens": 6651930.0, + "step": 2700 + }, + { + "entropy": 0.17993666499853134, + "epoch": 7.353413654618474, + "grad_norm": 0.47544270753860474, + "learning_rate": 4.1700527488762594e-05, + "loss": 0.12008686065673828, + "mean_token_accuracy": 0.9607802790403366, + "num_tokens": 6768469.0, + "step": 2750 + }, + { + "entropy": 0.17229609042406083, + "epoch": 7.4872824631860775, + "grad_norm": 0.4898432791233063, + "learning_rate": 3.786701245466089e-05, + "loss": 0.1164663314819336, + "mean_token_accuracy": 0.9622354304790497, + "num_tokens": 6892532.0, + "step": 2800 + }, + { + "entropy": 0.16839693702757358, + "epoch": 7.621151271753681, + "grad_norm": 0.5927155613899231, + "learning_rate": 3.417911816269838e-05, + "loss": 0.1138334846496582, + "mean_token_accuracy": 0.9632772338390351, + "num_tokens": 7023373.0, + "step": 2850 + }, + { + "entropy": 0.173521406725049, + "epoch": 7.755020080321285, + "grad_norm": 0.5407077670097351, + "learning_rate": 3.0644874532115575e-05, + "loss": 0.11670659065246582, + "mean_token_accuracy": 0.9622769457101822, + "num_tokens": 7146448.0, + "step": 2900 + }, + { + "entropy": 0.17213394075632096, + "epoch": 7.888888888888889, + "grad_norm": 0.48647794127464294, + "learning_rate": 2.727197692744389e-05, + "loss": 0.11715221405029297, + "mean_token_accuracy": 0.9625237709283829, + "num_tokens": 7267680.0, + "step": 2950 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2757616487890482, + "eval_loss": 0.8189995884895325, + "eval_mean_token_accuracy": 0.8390460336208343, + "eval_num_tokens": 7363496.0, + "eval_runtime": 49.0999, + "eval_samples_per_second": 32.566, + "eval_steps_per_second": 4.073, + "step": 2992 + }, + { + "entropy": 0.17219420319253748, + "epoch": 8.021419009370817, + "grad_norm": 0.37176114320755005, + "learning_rate": 2.406776940283137e-05, + "loss": 0.11532307624816894, + "mean_token_accuracy": 0.9632835845754604, + "num_tokens": 7384150.0, + "step": 3000 + }, + { + "entropy": 0.14740404956042766, + "epoch": 8.15528781793842, + "grad_norm": 0.3686515688896179, + "learning_rate": 2.10392287113017e-05, + "loss": 0.08609914779663086, + "mean_token_accuracy": 0.9740558165311813, + "num_tokens": 7502454.0, + "step": 3050 + }, + { + "entropy": 0.14145817942917346, + "epoch": 8.289156626506024, + "grad_norm": 0.3443886339664459, + "learning_rate": 1.8192949113764877e-05, + "loss": 0.081221923828125, + "mean_token_accuracy": 0.9744218772649765, + "num_tokens": 7628419.0, + "step": 3100 + }, + { + "entropy": 0.13880868263542653, + "epoch": 8.423025435073628, + "grad_norm": 0.4160684049129486, + "learning_rate": 1.5535128020855533e-05, + "loss": 0.0840027904510498, + "mean_token_accuracy": 0.9742409408092498, + "num_tokens": 7751912.0, + "step": 3150 + }, + { + "entropy": 0.13735023334622384, + "epoch": 8.556894243641231, + "grad_norm": 0.5086039900779724, + "learning_rate": 1.3071552498861985e-05, + "loss": 0.08229084014892578, + "mean_token_accuracy": 0.9739874929189682, + "num_tokens": 7877804.0, + "step": 3200 + }, + { + "entropy": 0.1375646834075451, + "epoch": 8.690763052208835, + "grad_norm": 0.32038062810897827, + "learning_rate": 1.0807586669127857e-05, + "loss": 0.08256589889526367, + "mean_token_accuracy": 0.9740087121725083, + "num_tokens": 8003296.0, + "step": 3250 + }, + { + "entropy": 0.14328225292265415, + "epoch": 8.824631860776439, + "grad_norm": 0.34184616804122925, + "learning_rate": 8.748160028362413e-06, + "loss": 0.08445584297180175, + "mean_token_accuracy": 0.9736961781978607, + "num_tokens": 8123859.0, + "step": 3300 + }, + { + "entropy": 0.14006814867258072, + "epoch": 8.958500669344042, + "grad_norm": 0.610028088092804, + "learning_rate": 6.897756715290319e-06, + "loss": 0.08359557151794433, + "mean_token_accuracy": 0.9739799553155899, + "num_tokens": 8246971.0, + "step": 3350 + }, + { + "epoch": 9.0, + "eval_entropy": 0.2503813248872757, + "eval_loss": 0.9325668215751648, + "eval_mean_token_accuracy": 0.8365286010503769, + "eval_num_tokens": 8283933.0, + "eval_runtime": 49.0602, + "eval_samples_per_second": 32.593, + "eval_steps_per_second": 4.077, + "step": 3366 + }, + { + "entropy": 0.13715504267902084, + "epoch": 9.09103078982597, + "grad_norm": 0.32644304633140564, + "learning_rate": 5.260405747011887e-06, + "loss": 0.0776783800125122, + "mean_token_accuracy": 0.9762971684186146, + "num_tokens": 8361698.0, + "step": 3400 + }, + { + "entropy": 0.13289645686745644, + "epoch": 9.224899598393574, + "grad_norm": 0.3593284785747528, + "learning_rate": 3.839672246332384e-06, + "loss": 0.07479411125183105, + "mean_token_accuracy": 0.977249429821968, + "num_tokens": 8479562.0, + "step": 3450 + }, + { + "entropy": 0.1306164874136448, + "epoch": 9.358768406961179, + "grad_norm": 0.3294520378112793, + "learning_rate": 2.6386496791621076e-06, + "loss": 0.07242131233215332, + "mean_token_accuracy": 0.9776324343681335, + "num_tokens": 8601271.0, + "step": 3500 + }, + { + "entropy": 0.12282529093325138, + "epoch": 9.492637215528783, + "grad_norm": 0.28185349702835083, + "learning_rate": 1.6599531188889682e-06, + "loss": 0.06905817031860352, + "mean_token_accuracy": 0.9788999700546265, + "num_tokens": 8730858.0, + "step": 3550 + }, + { + "entropy": 0.12793996281921863, + "epoch": 9.626506024096386, + "grad_norm": 0.39180248975753784, + "learning_rate": 9.057135523899838e-07, + "loss": 0.0722837495803833, + "mean_token_accuracy": 0.9780756998062133, + "num_tokens": 8855189.0, + "step": 3600 + }, + { + "entropy": 0.1260003688186407, + "epoch": 9.76037483266399, + "grad_norm": 0.354835569858551, + "learning_rate": 3.775732400792635e-07, + "loss": 0.07001883029937744, + "mean_token_accuracy": 0.9790224677324295, + "num_tokens": 8979510.0, + "step": 3650 + }, + { + "entropy": 0.12340887859463692, + "epoch": 9.894243641231594, + "grad_norm": 0.3675302267074585, + "learning_rate": 7.668214009545532e-08, + "loss": 0.06868332386016845, + "mean_token_accuracy": 0.9791393029689789, + "num_tokens": 9111678.0, + "step": 3700 + }, + { + "epoch": 10.0, + "eval_entropy": 0.24003039725124836, + "eval_loss": 0.9899837374687195, + "eval_mean_token_accuracy": 0.8358298748731613, + "eval_num_tokens": 9204370.0, + "eval_runtime": 49.1056, + "eval_samples_per_second": 32.562, + "eval_steps_per_second": 4.073, + "step": 3740 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.829323098596997e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..62108c4c8ad6a925598383a7b5a5345aa9b946e6 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.0652985372477836, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj", + "up_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a1747272bf5443b8411defd3f7cee01d9dd5990a --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/trainer_state.json @@ -0,0 +1,196 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 748, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.8254910862445832, + "epoch": 0.13386880856760375, + "grad_norm": 0.8958511352539062, + "learning_rate": 2.7446846603309888e-05, + "loss": 1.722928009033203, + "mean_token_accuracy": 0.6557231456041336, + "num_tokens": 122643.0, + "step": 50 + }, + { + "entropy": 0.8272530055046081, + "epoch": 0.2677376171352075, + "grad_norm": 0.6162250638008118, + "learning_rate": 5.5453832933217935e-05, + "loss": 0.7761137390136719, + "mean_token_accuracy": 0.795179500579834, + "num_tokens": 245702.0, + "step": 100 + }, + { + "entropy": 0.6629777508974075, + "epoch": 0.40160642570281124, + "grad_norm": 0.43394413590431213, + "learning_rate": 8.346081926312598e-05, + "loss": 0.6302793884277343, + "mean_token_accuracy": 0.8262274640798569, + "num_tokens": 371834.0, + "step": 150 + }, + { + "entropy": 0.6184763962030411, + "epoch": 0.535475234270415, + "grad_norm": 0.4373406767845154, + "learning_rate": 0.00011146780559303404, + "loss": 0.5863075256347656, + "mean_token_accuracy": 0.8357332807779312, + "num_tokens": 500948.0, + "step": 200 + }, + { + "entropy": 0.6116772794723511, + "epoch": 0.6693440428380187, + "grad_norm": 0.6781982183456421, + "learning_rate": 0.00013947479192294207, + "loss": 0.5783074951171875, + "mean_token_accuracy": 0.8368694090843201, + "num_tokens": 621874.0, + "step": 250 + }, + { + "entropy": 0.5848374783992767, + "epoch": 0.8032128514056225, + "grad_norm": 0.3493351340293884, + "learning_rate": 0.00016748177825285014, + "loss": 0.5531037902832031, + "mean_token_accuracy": 0.8434528934955597, + "num_tokens": 746138.0, + "step": 300 + }, + { + "entropy": 0.57043960750103, + "epoch": 0.9370816599732262, + "grad_norm": 0.3953551650047302, + "learning_rate": 0.00019548876458275817, + "loss": 0.5387083053588867, + "mean_token_accuracy": 0.844996885061264, + "num_tokens": 868331.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.6065685012936592, + "eval_loss": 0.5825985074043274, + "eval_mean_token_accuracy": 0.8326057174801826, + "eval_num_tokens": 920437.0, + "eval_runtime": 49.017, + "eval_samples_per_second": 32.621, + "eval_steps_per_second": 4.08, + "step": 374 + }, + { + "entropy": 0.5623676343397661, + "epoch": 1.069611780455154, + "grad_norm": 0.37431156635284424, + "learning_rate": 0.00020946374495076317, + "loss": 0.526231803894043, + "mean_token_accuracy": 0.8500469212580208, + "num_tokens": 980838.0, + "step": 400 + }, + { + "entropy": 0.5420066699385643, + "epoch": 1.2034805890227578, + "grad_norm": 0.3302382826805115, + "learning_rate": 0.00020923573570386192, + "loss": 0.5088261032104492, + "mean_token_accuracy": 0.8518517130613327, + "num_tokens": 1104763.0, + "step": 450 + }, + { + "entropy": 0.5347033357620239, + "epoch": 1.3373493975903614, + "grad_norm": 0.281392902135849, + "learning_rate": 0.00020878021367110025, + "loss": 0.5044374084472656, + "mean_token_accuracy": 0.8548643559217453, + "num_tokens": 1230459.0, + "step": 500 + }, + { + "entropy": 0.5319472518563271, + "epoch": 1.4712182061579653, + "grad_norm": 0.3298404812812805, + "learning_rate": 0.00020809817069357935, + "loss": 0.5011252593994141, + "mean_token_accuracy": 0.8546603417396545, + "num_tokens": 1356368.0, + "step": 550 + }, + { + "entropy": 0.5234624195098877, + "epoch": 1.605087014725569, + "grad_norm": 0.26496022939682007, + "learning_rate": 0.00020719109183285305, + "loss": 0.49288436889648435, + "mean_token_accuracy": 0.8559961414337158, + "num_tokens": 1484569.0, + "step": 600 + }, + { + "entropy": 0.5170091751217842, + "epoch": 1.7389558232931726, + "grad_norm": 0.2679271996021271, + "learning_rate": 0.00020606095213739626, + "loss": 0.4867116165161133, + "mean_token_accuracy": 0.8584025889635086, + "num_tokens": 1609962.0, + "step": 650 + }, + { + "entropy": 0.519580851495266, + "epoch": 1.8728246318607764, + "grad_norm": 0.2783832848072052, + "learning_rate": 0.0002047102123421885, + "loss": 0.4858899688720703, + "mean_token_accuracy": 0.8601382756233216, + "num_tokens": 1729667.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.5516054129600525, + "eval_loss": 0.5468233227729797, + "eval_mean_token_accuracy": 0.8418151989579201, + "eval_num_tokens": 1840874.0, + "eval_runtime": 49.0401, + "eval_samples_per_second": 32.606, + "eval_steps_per_second": 4.078, + "step": 748 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.650338585790669e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..18d0a258ad2321bd2f63d9fd1f15f08b504f6542 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/README.md @@ -0,0 +1,58 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: transformers +model_name: Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2 +tags: +- generated_from_trainer +- trl +- sft +licence: license +--- + +# Model Card for Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2 + +This model is a fine-tuned version of [Qwen/Qwen3-4B-Base](https://huggingface.co/Qwen/Qwen3-4B-Base). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/katriin-kukk/Cross_lingual_morphological_generalization/runs/ed11jhv7) + + + +This model was trained with SFT. + +### Framework versions + +- TRL: 0.29.0 +- Transformers: 5.5.4 +- Pytorch: 2.10.0 +- Datasets: 4.6.1 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a67d7b089767a1f6708064da5a07f65728eada3e --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_bias": false, + "lora_dropout": 0.0016857635936814886, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj", + "up_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..58f07f6c680b15779a532dc3b2429ba36fd336b7 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/trainer_state.json @@ -0,0 +1,287 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1122, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.3424121737480164, + "epoch": 0.13386880856760375, + "grad_norm": 1.1671011447906494, + "learning_rate": 2.7185774847148058e-05, + "loss": 1.2469227600097657, + "mean_token_accuracy": 0.7243366748094558, + "num_tokens": 122643.0, + "step": 50 + }, + { + "entropy": 0.6559184691309929, + "epoch": 0.2677376171352075, + "grad_norm": 0.8399614691734314, + "learning_rate": 5.4926361425870564e-05, + "loss": 0.6245294189453126, + "mean_token_accuracy": 0.8268856984376908, + "num_tokens": 245702.0, + "step": 100 + }, + { + "entropy": 0.6082554489374161, + "epoch": 0.40160642570281124, + "grad_norm": 0.5691282749176025, + "learning_rate": 8.266694800459306e-05, + "loss": 0.5804204177856446, + "mean_token_accuracy": 0.8374843555688858, + "num_tokens": 371834.0, + "step": 150 + }, + { + "entropy": 0.5805956655740738, + "epoch": 0.535475234270415, + "grad_norm": 0.45651012659072876, + "learning_rate": 0.00011040753458331558, + "loss": 0.5495451354980468, + "mean_token_accuracy": 0.8450068402290344, + "num_tokens": 500948.0, + "step": 200 + }, + { + "entropy": 0.5810796636343002, + "epoch": 0.6693440428380187, + "grad_norm": 0.493431955575943, + "learning_rate": 0.00013814812116203808, + "loss": 0.5546633911132812, + "mean_token_accuracy": 0.8431127589941024, + "num_tokens": 621874.0, + "step": 250 + }, + { + "entropy": 0.5683389616012573, + "epoch": 0.8032128514056225, + "grad_norm": 0.4698493182659149, + "learning_rate": 0.00016588870774076058, + "loss": 0.5354468536376953, + "mean_token_accuracy": 0.8477097982168198, + "num_tokens": 746138.0, + "step": 300 + }, + { + "entropy": 0.5579970148205757, + "epoch": 0.9370816599732262, + "grad_norm": 0.5637578964233398, + "learning_rate": 0.0001936292943194831, + "loss": 0.5262400054931641, + "mean_token_accuracy": 0.8487933957576752, + "num_tokens": 868331.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5953671643137932, + "eval_loss": 0.5836789608001709, + "eval_mean_token_accuracy": 0.8373425653576851, + "eval_num_tokens": 920437.0, + "eval_runtime": 49.8765, + "eval_samples_per_second": 32.039, + "eval_steps_per_second": 4.01, + "step": 374 + }, + { + "entropy": 0.5491663054986433, + "epoch": 1.069611780455154, + "grad_norm": 0.6051601767539978, + "learning_rate": 0.0002074713460228683, + "loss": 0.5132473373413086, + "mean_token_accuracy": 0.8526828770685677, + "num_tokens": 980838.0, + "step": 400 + }, + { + "entropy": 0.5334427100419998, + "epoch": 1.2034805890227578, + "grad_norm": 0.4304046630859375, + "learning_rate": 0.00020724550557791978, + "loss": 0.497388916015625, + "mean_token_accuracy": 0.8541187030076981, + "num_tokens": 1104763.0, + "step": 450 + }, + { + "entropy": 0.5297759872674942, + "epoch": 1.3373493975903614, + "grad_norm": 0.4243815541267395, + "learning_rate": 0.00020679431642677408, + "loss": 0.49724563598632815, + "mean_token_accuracy": 0.8557562667131424, + "num_tokens": 1230459.0, + "step": 500 + }, + { + "entropy": 0.5287212440371514, + "epoch": 1.4712182061579653, + "grad_norm": 0.5970085859298706, + "learning_rate": 0.0002061187609762355, + "loss": 0.4903334808349609, + "mean_token_accuracy": 0.8566980129480362, + "num_tokens": 1356368.0, + "step": 550 + }, + { + "entropy": 0.5196757692098618, + "epoch": 1.605087014725569, + "grad_norm": 0.353085458278656, + "learning_rate": 0.00020522031016209576, + "loss": 0.48056564331054685, + "mean_token_accuracy": 0.8591135066747665, + "num_tokens": 1484569.0, + "step": 600 + }, + { + "entropy": 0.5124905353784561, + "epoch": 1.7389558232931726, + "grad_norm": 0.383682519197464, + "learning_rate": 0.00020410092024635923, + "loss": 0.47599597930908205, + "mean_token_accuracy": 0.8606968414783478, + "num_tokens": 1609962.0, + "step": 650 + }, + { + "entropy": 0.5109778612852096, + "epoch": 1.8728246318607764, + "grad_norm": 0.35959598422050476, + "learning_rate": 0.00020276302855773176, + "loss": 0.47350929260253904, + "mean_token_accuracy": 0.862987876534462, + "num_tokens": 1729667.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.5806624293327332, + "eval_loss": 0.5672553181648254, + "eval_mean_token_accuracy": 0.8352122050523758, + "eval_num_tokens": 1840874.0, + "eval_runtime": 49.8669, + "eval_samples_per_second": 32.045, + "eval_steps_per_second": 4.011, + "step": 748 + }, + { + "entropy": 0.5177121743409321, + "epoch": 2.005354752342704, + "grad_norm": 0.33286598324775696, + "learning_rate": 0.00020120954818464854, + "loss": 0.4759817886352539, + "mean_token_accuracy": 0.8611076672871908, + "num_tokens": 1846410.0, + "step": 750 + }, + { + "entropy": 0.43643771946430204, + "epoch": 2.139223560910308, + "grad_norm": 0.3718855679035187, + "learning_rate": 0.00019944386163239588, + "loss": 0.3936069107055664, + "mean_token_accuracy": 0.8780064672231674, + "num_tokens": 1971547.0, + "step": 800 + }, + { + "entropy": 0.44890178814530374, + "epoch": 2.2730923694779115, + "grad_norm": 0.4402499496936798, + "learning_rate": 0.0001974698134581373, + "loss": 0.40427154541015625, + "mean_token_accuracy": 0.8763552361726761, + "num_tokens": 2088560.0, + "step": 850 + }, + { + "entropy": 0.44539201706647874, + "epoch": 2.4069611780455156, + "grad_norm": 0.437489777803421, + "learning_rate": 0.00019529170189988115, + "loss": 0.4049137878417969, + "mean_token_accuracy": 0.8759620261192321, + "num_tokens": 2210146.0, + "step": 900 + }, + { + "entropy": 0.4456777948141098, + "epoch": 2.540829986613119, + "grad_norm": 0.5221232175827026, + "learning_rate": 0.0001929142695176156, + "loss": 0.4049659729003906, + "mean_token_accuracy": 0.8767672145366668, + "num_tokens": 2331354.0, + "step": 950 + }, + { + "entropy": 0.44445386946201326, + "epoch": 2.674698795180723, + "grad_norm": 0.48457658290863037, + "learning_rate": 0.00019034269286698953, + "loss": 0.4065634536743164, + "mean_token_accuracy": 0.8764267575740814, + "num_tokens": 2452558.0, + "step": 1000 + }, + { + "entropy": 0.44160154819488523, + "epoch": 2.8085676037483265, + "grad_norm": 0.36902090907096863, + "learning_rate": 0.00018758257122802307, + "loss": 0.4023736953735352, + "mean_token_accuracy": 0.8786762475967407, + "num_tokens": 2582428.0, + "step": 1050 + }, + { + "entropy": 0.4473246121406555, + "epoch": 2.9424364123159306, + "grad_norm": 0.3760707676410675, + "learning_rate": 0.00018463991441338993, + "loss": 0.40938362121582034, + "mean_token_accuracy": 0.8754651814699173, + "num_tokens": 2707532.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.4759794683754444, + "eval_loss": 0.5672900676727295, + "eval_mean_token_accuracy": 0.843816783130169, + "eval_num_tokens": 2761311.0, + "eval_runtime": 49.9422, + "eval_samples_per_second": 31.997, + "eval_steps_per_second": 4.005, + "step": 1122 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.529510818991954e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a67d7b089767a1f6708064da5a07f65728eada3e --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_bias": false, + "lora_dropout": 0.0016857635936814886, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj", + "up_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d080195d92ab55de2f2d8b4ab836604414caae55 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/trainer_state.json @@ -0,0 +1,368 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 1496, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.3424121737480164, + "epoch": 0.13386880856760375, + "grad_norm": 1.1671011447906494, + "learning_rate": 2.7185774847148058e-05, + "loss": 1.2469227600097657, + "mean_token_accuracy": 0.7243366748094558, + "num_tokens": 122643.0, + "step": 50 + }, + { + "entropy": 0.6559184691309929, + "epoch": 0.2677376171352075, + "grad_norm": 0.8399614691734314, + "learning_rate": 5.4926361425870564e-05, + "loss": 0.6245294189453126, + "mean_token_accuracy": 0.8268856984376908, + "num_tokens": 245702.0, + "step": 100 + }, + { + "entropy": 0.6082554489374161, + "epoch": 0.40160642570281124, + "grad_norm": 0.5691282749176025, + "learning_rate": 8.266694800459306e-05, + "loss": 0.5804204177856446, + "mean_token_accuracy": 0.8374843555688858, + "num_tokens": 371834.0, + "step": 150 + }, + { + "entropy": 0.5805956655740738, + "epoch": 0.535475234270415, + "grad_norm": 0.45651012659072876, + "learning_rate": 0.00011040753458331558, + "loss": 0.5495451354980468, + "mean_token_accuracy": 0.8450068402290344, + "num_tokens": 500948.0, + "step": 200 + }, + { + "entropy": 0.5810796636343002, + "epoch": 0.6693440428380187, + "grad_norm": 0.493431955575943, + "learning_rate": 0.00013814812116203808, + "loss": 0.5546633911132812, + "mean_token_accuracy": 0.8431127589941024, + "num_tokens": 621874.0, + "step": 250 + }, + { + "entropy": 0.5683389616012573, + "epoch": 0.8032128514056225, + "grad_norm": 0.4698493182659149, + "learning_rate": 0.00016588870774076058, + "loss": 0.5354468536376953, + "mean_token_accuracy": 0.8477097982168198, + "num_tokens": 746138.0, + "step": 300 + }, + { + "entropy": 0.5579970148205757, + "epoch": 0.9370816599732262, + "grad_norm": 0.5637578964233398, + "learning_rate": 0.0001936292943194831, + "loss": 0.5262400054931641, + "mean_token_accuracy": 0.8487933957576752, + "num_tokens": 868331.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5953671643137932, + "eval_loss": 0.5836789608001709, + "eval_mean_token_accuracy": 0.8373425653576851, + "eval_num_tokens": 920437.0, + "eval_runtime": 49.8765, + "eval_samples_per_second": 32.039, + "eval_steps_per_second": 4.01, + "step": 374 + }, + { + "entropy": 0.5491663054986433, + "epoch": 1.069611780455154, + "grad_norm": 0.6051601767539978, + "learning_rate": 0.0002074713460228683, + "loss": 0.5132473373413086, + "mean_token_accuracy": 0.8526828770685677, + "num_tokens": 980838.0, + "step": 400 + }, + { + "entropy": 0.5334427100419998, + "epoch": 1.2034805890227578, + "grad_norm": 0.4304046630859375, + "learning_rate": 0.00020724550557791978, + "loss": 0.497388916015625, + "mean_token_accuracy": 0.8541187030076981, + "num_tokens": 1104763.0, + "step": 450 + }, + { + "entropy": 0.5297759872674942, + "epoch": 1.3373493975903614, + "grad_norm": 0.4243815541267395, + "learning_rate": 0.00020679431642677408, + "loss": 0.49724563598632815, + "mean_token_accuracy": 0.8557562667131424, + "num_tokens": 1230459.0, + "step": 500 + }, + { + "entropy": 0.5287212440371514, + "epoch": 1.4712182061579653, + "grad_norm": 0.5970085859298706, + "learning_rate": 0.0002061187609762355, + "loss": 0.4903334808349609, + "mean_token_accuracy": 0.8566980129480362, + "num_tokens": 1356368.0, + "step": 550 + }, + { + "entropy": 0.5196757692098618, + "epoch": 1.605087014725569, + "grad_norm": 0.353085458278656, + "learning_rate": 0.00020522031016209576, + "loss": 0.48056564331054685, + "mean_token_accuracy": 0.8591135066747665, + "num_tokens": 1484569.0, + "step": 600 + }, + { + "entropy": 0.5124905353784561, + "epoch": 1.7389558232931726, + "grad_norm": 0.383682519197464, + "learning_rate": 0.00020410092024635923, + "loss": 0.47599597930908205, + "mean_token_accuracy": 0.8606968414783478, + "num_tokens": 1609962.0, + "step": 650 + }, + { + "entropy": 0.5109778612852096, + "epoch": 1.8728246318607764, + "grad_norm": 0.35959598422050476, + "learning_rate": 0.00020276302855773176, + "loss": 0.47350929260253904, + "mean_token_accuracy": 0.862987876534462, + "num_tokens": 1729667.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.5806624293327332, + "eval_loss": 0.5672553181648254, + "eval_mean_token_accuracy": 0.8352122050523758, + "eval_num_tokens": 1840874.0, + "eval_runtime": 49.8669, + "eval_samples_per_second": 32.045, + "eval_steps_per_second": 4.011, + "step": 748 + }, + { + "entropy": 0.5177121743409321, + "epoch": 2.005354752342704, + "grad_norm": 0.33286598324775696, + "learning_rate": 0.00020120954818464854, + "loss": 0.4759817886352539, + "mean_token_accuracy": 0.8611076672871908, + "num_tokens": 1846410.0, + "step": 750 + }, + { + "entropy": 0.43643771946430204, + "epoch": 2.139223560910308, + "grad_norm": 0.3718855679035187, + "learning_rate": 0.00019944386163239588, + "loss": 0.3936069107055664, + "mean_token_accuracy": 0.8780064672231674, + "num_tokens": 1971547.0, + "step": 800 + }, + { + "entropy": 0.44890178814530374, + "epoch": 2.2730923694779115, + "grad_norm": 0.4402499496936798, + "learning_rate": 0.0001974698134581373, + "loss": 0.40427154541015625, + "mean_token_accuracy": 0.8763552361726761, + "num_tokens": 2088560.0, + "step": 850 + }, + { + "entropy": 0.44539201706647874, + "epoch": 2.4069611780455156, + "grad_norm": 0.437489777803421, + "learning_rate": 0.00019529170189988115, + "loss": 0.4049137878417969, + "mean_token_accuracy": 0.8759620261192321, + "num_tokens": 2210146.0, + "step": 900 + }, + { + "entropy": 0.4456777948141098, + "epoch": 2.540829986613119, + "grad_norm": 0.5221232175827026, + "learning_rate": 0.0001929142695176156, + "loss": 0.4049659729003906, + "mean_token_accuracy": 0.8767672145366668, + "num_tokens": 2331354.0, + "step": 950 + }, + { + "entropy": 0.44445386946201326, + "epoch": 2.674698795180723, + "grad_norm": 0.48457658290863037, + "learning_rate": 0.00019034269286698953, + "loss": 0.4065634536743164, + "mean_token_accuracy": 0.8764267575740814, + "num_tokens": 2452558.0, + "step": 1000 + }, + { + "entropy": 0.44160154819488523, + "epoch": 2.8085676037483265, + "grad_norm": 0.36902090907096863, + "learning_rate": 0.00018758257122802307, + "loss": 0.4023736953735352, + "mean_token_accuracy": 0.8786762475967407, + "num_tokens": 2582428.0, + "step": 1050 + }, + { + "entropy": 0.4473246121406555, + "epoch": 2.9424364123159306, + "grad_norm": 0.3760707676410675, + "learning_rate": 0.00018463991441338993, + "loss": 0.40938362121582034, + "mean_token_accuracy": 0.8754651814699173, + "num_tokens": 2707532.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.4759794683754444, + "eval_loss": 0.5672900676727295, + "eval_mean_token_accuracy": 0.843816783130169, + "eval_num_tokens": 2761311.0, + "eval_runtime": 49.9422, + "eval_samples_per_second": 31.997, + "eval_steps_per_second": 4.005, + "step": 1122 + }, + { + "entropy": 0.3893387094892637, + "epoch": 3.074966532797858, + "grad_norm": 0.4708668291568756, + "learning_rate": 0.00018152112968281706, + "loss": 0.3442184829711914, + "mean_token_accuracy": 0.8925870771359916, + "num_tokens": 2834344.0, + "step": 1150 + }, + { + "entropy": 0.36019587606191633, + "epoch": 3.208835341365462, + "grad_norm": 0.49764320254325867, + "learning_rate": 0.00017823300779209423, + "loss": 0.3140977668762207, + "mean_token_accuracy": 0.8975517880916596, + "num_tokens": 2952748.0, + "step": 1200 + }, + { + "entropy": 0.3615420612692833, + "epoch": 3.3427041499330654, + "grad_norm": 0.49308347702026367, + "learning_rate": 0.0001747827082070698, + "loss": 0.31728214263916016, + "mean_token_accuracy": 0.8988397961854935, + "num_tokens": 3080333.0, + "step": 1250 + }, + { + "entropy": 0.36845425054430964, + "epoch": 3.4765729585006695, + "grad_norm": 0.47647032141685486, + "learning_rate": 0.00017117774351482735, + "loss": 0.3203315734863281, + "mean_token_accuracy": 0.8968151319026947, + "num_tokens": 3206920.0, + "step": 1300 + }, + { + "entropy": 0.37361170917749403, + "epoch": 3.610441767068273, + "grad_norm": 0.5282555818557739, + "learning_rate": 0.000167425963065986, + "loss": 0.32945499420166013, + "mean_token_accuracy": 0.8949814730882645, + "num_tokens": 3327931.0, + "step": 1350 + }, + { + "entropy": 0.3739692223072052, + "epoch": 3.7443105756358768, + "grad_norm": 0.41107845306396484, + "learning_rate": 0.00016353553588374095, + "loss": 0.32604251861572264, + "mean_token_accuracy": 0.896143769621849, + "num_tokens": 3443874.0, + "step": 1400 + }, + { + "entropy": 0.37787127375602725, + "epoch": 3.878179384203481, + "grad_norm": 0.39750751852989197, + "learning_rate": 0.00015951493287685788, + "loss": 0.3352021026611328, + "mean_token_accuracy": 0.8935402005910873, + "num_tokens": 3573158.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.4485042405128479, + "eval_loss": 0.5753094553947449, + "eval_mean_token_accuracy": 0.8450798985362052, + "eval_num_tokens": 3681748.0, + "eval_runtime": 49.9605, + "eval_samples_per_second": 31.985, + "eval_steps_per_second": 4.003, + "step": 1496 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.0412494723875738e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a67d7b089767a1f6708064da5a07f65728eada3e --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_bias": false, + "lora_dropout": 0.0016857635936814886, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj", + "up_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c3ab89aba0dc0e69061b554799e0589717afef72 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/trainer_state.json @@ -0,0 +1,459 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 1870, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.3424121737480164, + "epoch": 0.13386880856760375, + "grad_norm": 1.1671011447906494, + "learning_rate": 2.7185774847148058e-05, + "loss": 1.2469227600097657, + "mean_token_accuracy": 0.7243366748094558, + "num_tokens": 122643.0, + "step": 50 + }, + { + "entropy": 0.6559184691309929, + "epoch": 0.2677376171352075, + "grad_norm": 0.8399614691734314, + "learning_rate": 5.4926361425870564e-05, + "loss": 0.6245294189453126, + "mean_token_accuracy": 0.8268856984376908, + "num_tokens": 245702.0, + "step": 100 + }, + { + "entropy": 0.6082554489374161, + "epoch": 0.40160642570281124, + "grad_norm": 0.5691282749176025, + "learning_rate": 8.266694800459306e-05, + "loss": 0.5804204177856446, + "mean_token_accuracy": 0.8374843555688858, + "num_tokens": 371834.0, + "step": 150 + }, + { + "entropy": 0.5805956655740738, + "epoch": 0.535475234270415, + "grad_norm": 0.45651012659072876, + "learning_rate": 0.00011040753458331558, + "loss": 0.5495451354980468, + "mean_token_accuracy": 0.8450068402290344, + "num_tokens": 500948.0, + "step": 200 + }, + { + "entropy": 0.5810796636343002, + "epoch": 0.6693440428380187, + "grad_norm": 0.493431955575943, + "learning_rate": 0.00013814812116203808, + "loss": 0.5546633911132812, + "mean_token_accuracy": 0.8431127589941024, + "num_tokens": 621874.0, + "step": 250 + }, + { + "entropy": 0.5683389616012573, + "epoch": 0.8032128514056225, + "grad_norm": 0.4698493182659149, + "learning_rate": 0.00016588870774076058, + "loss": 0.5354468536376953, + "mean_token_accuracy": 0.8477097982168198, + "num_tokens": 746138.0, + "step": 300 + }, + { + "entropy": 0.5579970148205757, + "epoch": 0.9370816599732262, + "grad_norm": 0.5637578964233398, + "learning_rate": 0.0001936292943194831, + "loss": 0.5262400054931641, + "mean_token_accuracy": 0.8487933957576752, + "num_tokens": 868331.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5953671643137932, + "eval_loss": 0.5836789608001709, + "eval_mean_token_accuracy": 0.8373425653576851, + "eval_num_tokens": 920437.0, + "eval_runtime": 49.8765, + "eval_samples_per_second": 32.039, + "eval_steps_per_second": 4.01, + "step": 374 + }, + { + "entropy": 0.5491663054986433, + "epoch": 1.069611780455154, + "grad_norm": 0.6051601767539978, + "learning_rate": 0.0002074713460228683, + "loss": 0.5132473373413086, + "mean_token_accuracy": 0.8526828770685677, + "num_tokens": 980838.0, + "step": 400 + }, + { + "entropy": 0.5334427100419998, + "epoch": 1.2034805890227578, + "grad_norm": 0.4304046630859375, + "learning_rate": 0.00020724550557791978, + "loss": 0.497388916015625, + "mean_token_accuracy": 0.8541187030076981, + "num_tokens": 1104763.0, + "step": 450 + }, + { + "entropy": 0.5297759872674942, + "epoch": 1.3373493975903614, + "grad_norm": 0.4243815541267395, + "learning_rate": 0.00020679431642677408, + "loss": 0.49724563598632815, + "mean_token_accuracy": 0.8557562667131424, + "num_tokens": 1230459.0, + "step": 500 + }, + { + "entropy": 0.5287212440371514, + "epoch": 1.4712182061579653, + "grad_norm": 0.5970085859298706, + "learning_rate": 0.0002061187609762355, + "loss": 0.4903334808349609, + "mean_token_accuracy": 0.8566980129480362, + "num_tokens": 1356368.0, + "step": 550 + }, + { + "entropy": 0.5196757692098618, + "epoch": 1.605087014725569, + "grad_norm": 0.353085458278656, + "learning_rate": 0.00020522031016209576, + "loss": 0.48056564331054685, + "mean_token_accuracy": 0.8591135066747665, + "num_tokens": 1484569.0, + "step": 600 + }, + { + "entropy": 0.5124905353784561, + "epoch": 1.7389558232931726, + "grad_norm": 0.383682519197464, + "learning_rate": 0.00020410092024635923, + "loss": 0.47599597930908205, + "mean_token_accuracy": 0.8606968414783478, + "num_tokens": 1609962.0, + "step": 650 + }, + { + "entropy": 0.5109778612852096, + "epoch": 1.8728246318607764, + "grad_norm": 0.35959598422050476, + "learning_rate": 0.00020276302855773176, + "loss": 0.47350929260253904, + "mean_token_accuracy": 0.862987876534462, + "num_tokens": 1729667.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.5806624293327332, + "eval_loss": 0.5672553181648254, + "eval_mean_token_accuracy": 0.8352122050523758, + "eval_num_tokens": 1840874.0, + "eval_runtime": 49.8669, + "eval_samples_per_second": 32.045, + "eval_steps_per_second": 4.011, + "step": 748 + }, + { + "entropy": 0.5177121743409321, + "epoch": 2.005354752342704, + "grad_norm": 0.33286598324775696, + "learning_rate": 0.00020120954818464854, + "loss": 0.4759817886352539, + "mean_token_accuracy": 0.8611076672871908, + "num_tokens": 1846410.0, + "step": 750 + }, + { + "entropy": 0.43643771946430204, + "epoch": 2.139223560910308, + "grad_norm": 0.3718855679035187, + "learning_rate": 0.00019944386163239588, + "loss": 0.3936069107055664, + "mean_token_accuracy": 0.8780064672231674, + "num_tokens": 1971547.0, + "step": 800 + }, + { + "entropy": 0.44890178814530374, + "epoch": 2.2730923694779115, + "grad_norm": 0.4402499496936798, + "learning_rate": 0.0001974698134581373, + "loss": 0.40427154541015625, + "mean_token_accuracy": 0.8763552361726761, + "num_tokens": 2088560.0, + "step": 850 + }, + { + "entropy": 0.44539201706647874, + "epoch": 2.4069611780455156, + "grad_norm": 0.437489777803421, + "learning_rate": 0.00019529170189988115, + "loss": 0.4049137878417969, + "mean_token_accuracy": 0.8759620261192321, + "num_tokens": 2210146.0, + "step": 900 + }, + { + "entropy": 0.4456777948141098, + "epoch": 2.540829986613119, + "grad_norm": 0.5221232175827026, + "learning_rate": 0.0001929142695176156, + "loss": 0.4049659729003906, + "mean_token_accuracy": 0.8767672145366668, + "num_tokens": 2331354.0, + "step": 950 + }, + { + "entropy": 0.44445386946201326, + "epoch": 2.674698795180723, + "grad_norm": 0.48457658290863037, + "learning_rate": 0.00019034269286698953, + "loss": 0.4065634536743164, + "mean_token_accuracy": 0.8764267575740814, + "num_tokens": 2452558.0, + "step": 1000 + }, + { + "entropy": 0.44160154819488523, + "epoch": 2.8085676037483265, + "grad_norm": 0.36902090907096863, + "learning_rate": 0.00018758257122802307, + "loss": 0.4023736953735352, + "mean_token_accuracy": 0.8786762475967407, + "num_tokens": 2582428.0, + "step": 1050 + }, + { + "entropy": 0.4473246121406555, + "epoch": 2.9424364123159306, + "grad_norm": 0.3760707676410675, + "learning_rate": 0.00018463991441338993, + "loss": 0.40938362121582034, + "mean_token_accuracy": 0.8754651814699173, + "num_tokens": 2707532.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.4759794683754444, + "eval_loss": 0.5672900676727295, + "eval_mean_token_accuracy": 0.843816783130169, + "eval_num_tokens": 2761311.0, + "eval_runtime": 49.9422, + "eval_samples_per_second": 31.997, + "eval_steps_per_second": 4.005, + "step": 1122 + }, + { + "entropy": 0.3893387094892637, + "epoch": 3.074966532797858, + "grad_norm": 0.4708668291568756, + "learning_rate": 0.00018152112968281706, + "loss": 0.3442184829711914, + "mean_token_accuracy": 0.8925870771359916, + "num_tokens": 2834344.0, + "step": 1150 + }, + { + "entropy": 0.36019587606191633, + "epoch": 3.208835341365462, + "grad_norm": 0.49764320254325867, + "learning_rate": 0.00017823300779209423, + "loss": 0.3140977668762207, + "mean_token_accuracy": 0.8975517880916596, + "num_tokens": 2952748.0, + "step": 1200 + }, + { + "entropy": 0.3615420612692833, + "epoch": 3.3427041499330654, + "grad_norm": 0.49308347702026367, + "learning_rate": 0.0001747827082070698, + "loss": 0.31728214263916016, + "mean_token_accuracy": 0.8988397961854935, + "num_tokens": 3080333.0, + "step": 1250 + }, + { + "entropy": 0.36845425054430964, + "epoch": 3.4765729585006695, + "grad_norm": 0.47647032141685486, + "learning_rate": 0.00017117774351482735, + "loss": 0.3203315734863281, + "mean_token_accuracy": 0.8968151319026947, + "num_tokens": 3206920.0, + "step": 1300 + }, + { + "entropy": 0.37361170917749403, + "epoch": 3.610441767068273, + "grad_norm": 0.5282555818557739, + "learning_rate": 0.000167425963065986, + "loss": 0.32945499420166013, + "mean_token_accuracy": 0.8949814730882645, + "num_tokens": 3327931.0, + "step": 1350 + }, + { + "entropy": 0.3739692223072052, + "epoch": 3.7443105756358768, + "grad_norm": 0.41107845306396484, + "learning_rate": 0.00016353553588374095, + "loss": 0.32604251861572264, + "mean_token_accuracy": 0.896143769621849, + "num_tokens": 3443874.0, + "step": 1400 + }, + { + "entropy": 0.37787127375602725, + "epoch": 3.878179384203481, + "grad_norm": 0.39750751852989197, + "learning_rate": 0.00015951493287685788, + "loss": 0.3352021026611328, + "mean_token_accuracy": 0.8935402005910873, + "num_tokens": 3573158.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.4485042405128479, + "eval_loss": 0.5753094553947449, + "eval_mean_token_accuracy": 0.8450798985362052, + "eval_num_tokens": 3681748.0, + "eval_runtime": 49.9605, + "eval_samples_per_second": 31.985, + "eval_steps_per_second": 4.003, + "step": 1496 + }, + { + "entropy": 0.3735277107869736, + "epoch": 4.010709504685408, + "grad_norm": 0.41153082251548767, + "learning_rate": 0.00015537290839535005, + "loss": 0.327095947265625, + "mean_token_accuracy": 0.8959399853089843, + "num_tokens": 3691654.0, + "step": 1500 + }, + { + "entropy": 0.2689096394181252, + "epoch": 4.144578313253012, + "grad_norm": 0.5363789200782776, + "learning_rate": 0.00015111848116899814, + "loss": 0.2247480583190918, + "mean_token_accuracy": 0.9249986118078232, + "num_tokens": 3813863.0, + "step": 1550 + }, + { + "entropy": 0.27684757232666013, + "epoch": 4.278447121820616, + "grad_norm": 0.5589100122451782, + "learning_rate": 0.00014676091467021694, + "loss": 0.23430667877197264, + "mean_token_accuracy": 0.9212016260623932, + "num_tokens": 3942009.0, + "step": 1600 + }, + { + "entropy": 0.27285940989851953, + "epoch": 4.412315930388219, + "grad_norm": 0.4415719211101532, + "learning_rate": 0.00014230969694402636, + "loss": 0.23151195526123047, + "mean_token_accuracy": 0.922565575838089, + "num_tokens": 4067146.0, + "step": 1650 + }, + { + "entropy": 0.28027778953313826, + "epoch": 4.546184738955823, + "grad_norm": 0.544822096824646, + "learning_rate": 0.0001377745199490439, + "loss": 0.23426279067993164, + "mean_token_accuracy": 0.9214058065414429, + "num_tokens": 4186586.0, + "step": 1700 + }, + { + "entropy": 0.2855076715350151, + "epoch": 4.680053547523427, + "grad_norm": 0.47745293378829956, + "learning_rate": 0.00013316525845448153, + "loss": 0.2384078598022461, + "mean_token_accuracy": 0.9208149307966232, + "num_tokens": 4307001.0, + "step": 1750 + }, + { + "entropy": 0.28488670364022256, + "epoch": 4.813922356091031, + "grad_norm": 0.6087909936904907, + "learning_rate": 0.00012849194853909585, + "loss": 0.24047565460205078, + "mean_token_accuracy": 0.9198217475414276, + "num_tokens": 4429513.0, + "step": 1800 + }, + { + "entropy": 0.2799084801971912, + "epoch": 4.947791164658635, + "grad_norm": 0.4444660544395447, + "learning_rate": 0.00012376476573890707, + "loss": 0.23463037490844726, + "mean_token_accuracy": 0.9206935846805573, + "num_tokens": 4557562.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.3690616069734097, + "eval_loss": 0.6533966064453125, + "eval_mean_token_accuracy": 0.8432427588105201, + "eval_num_tokens": 4602185.0, + "eval_runtime": 49.893, + "eval_samples_per_second": 32.029, + "eval_steps_per_second": 4.009, + "step": 1870 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.545178197920768e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a67d7b089767a1f6708064da5a07f65728eada3e --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_bias": false, + "lora_dropout": 0.0016857635936814886, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj", + "up_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..dc42c5a0a68bd40d57e512b9172007bee433d499 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/trainer_state.json @@ -0,0 +1,540 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.0, + "eval_steps": 500, + "global_step": 2244, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.3424121737480164, + "epoch": 0.13386880856760375, + "grad_norm": 1.1671011447906494, + "learning_rate": 2.7185774847148058e-05, + "loss": 1.2469227600097657, + "mean_token_accuracy": 0.7243366748094558, + "num_tokens": 122643.0, + "step": 50 + }, + { + "entropy": 0.6559184691309929, + "epoch": 0.2677376171352075, + "grad_norm": 0.8399614691734314, + "learning_rate": 5.4926361425870564e-05, + "loss": 0.6245294189453126, + "mean_token_accuracy": 0.8268856984376908, + "num_tokens": 245702.0, + "step": 100 + }, + { + "entropy": 0.6082554489374161, + "epoch": 0.40160642570281124, + "grad_norm": 0.5691282749176025, + "learning_rate": 8.266694800459306e-05, + "loss": 0.5804204177856446, + "mean_token_accuracy": 0.8374843555688858, + "num_tokens": 371834.0, + "step": 150 + }, + { + "entropy": 0.5805956655740738, + "epoch": 0.535475234270415, + "grad_norm": 0.45651012659072876, + "learning_rate": 0.00011040753458331558, + "loss": 0.5495451354980468, + "mean_token_accuracy": 0.8450068402290344, + "num_tokens": 500948.0, + "step": 200 + }, + { + "entropy": 0.5810796636343002, + "epoch": 0.6693440428380187, + "grad_norm": 0.493431955575943, + "learning_rate": 0.00013814812116203808, + "loss": 0.5546633911132812, + "mean_token_accuracy": 0.8431127589941024, + "num_tokens": 621874.0, + "step": 250 + }, + { + "entropy": 0.5683389616012573, + "epoch": 0.8032128514056225, + "grad_norm": 0.4698493182659149, + "learning_rate": 0.00016588870774076058, + "loss": 0.5354468536376953, + "mean_token_accuracy": 0.8477097982168198, + "num_tokens": 746138.0, + "step": 300 + }, + { + "entropy": 0.5579970148205757, + "epoch": 0.9370816599732262, + "grad_norm": 0.5637578964233398, + "learning_rate": 0.0001936292943194831, + "loss": 0.5262400054931641, + "mean_token_accuracy": 0.8487933957576752, + "num_tokens": 868331.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5953671643137932, + "eval_loss": 0.5836789608001709, + "eval_mean_token_accuracy": 0.8373425653576851, + "eval_num_tokens": 920437.0, + "eval_runtime": 49.8765, + "eval_samples_per_second": 32.039, + "eval_steps_per_second": 4.01, + "step": 374 + }, + { + "entropy": 0.5491663054986433, + "epoch": 1.069611780455154, + "grad_norm": 0.6051601767539978, + "learning_rate": 0.0002074713460228683, + "loss": 0.5132473373413086, + "mean_token_accuracy": 0.8526828770685677, + "num_tokens": 980838.0, + "step": 400 + }, + { + "entropy": 0.5334427100419998, + "epoch": 1.2034805890227578, + "grad_norm": 0.4304046630859375, + "learning_rate": 0.00020724550557791978, + "loss": 0.497388916015625, + "mean_token_accuracy": 0.8541187030076981, + "num_tokens": 1104763.0, + "step": 450 + }, + { + "entropy": 0.5297759872674942, + "epoch": 1.3373493975903614, + "grad_norm": 0.4243815541267395, + "learning_rate": 0.00020679431642677408, + "loss": 0.49724563598632815, + "mean_token_accuracy": 0.8557562667131424, + "num_tokens": 1230459.0, + "step": 500 + }, + { + "entropy": 0.5287212440371514, + "epoch": 1.4712182061579653, + "grad_norm": 0.5970085859298706, + "learning_rate": 0.0002061187609762355, + "loss": 0.4903334808349609, + "mean_token_accuracy": 0.8566980129480362, + "num_tokens": 1356368.0, + "step": 550 + }, + { + "entropy": 0.5196757692098618, + "epoch": 1.605087014725569, + "grad_norm": 0.353085458278656, + "learning_rate": 0.00020522031016209576, + "loss": 0.48056564331054685, + "mean_token_accuracy": 0.8591135066747665, + "num_tokens": 1484569.0, + "step": 600 + }, + { + "entropy": 0.5124905353784561, + "epoch": 1.7389558232931726, + "grad_norm": 0.383682519197464, + "learning_rate": 0.00020410092024635923, + "loss": 0.47599597930908205, + "mean_token_accuracy": 0.8606968414783478, + "num_tokens": 1609962.0, + "step": 650 + }, + { + "entropy": 0.5109778612852096, + "epoch": 1.8728246318607764, + "grad_norm": 0.35959598422050476, + "learning_rate": 0.00020276302855773176, + "loss": 0.47350929260253904, + "mean_token_accuracy": 0.862987876534462, + "num_tokens": 1729667.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.5806624293327332, + "eval_loss": 0.5672553181648254, + "eval_mean_token_accuracy": 0.8352122050523758, + "eval_num_tokens": 1840874.0, + "eval_runtime": 49.8669, + "eval_samples_per_second": 32.045, + "eval_steps_per_second": 4.011, + "step": 748 + }, + { + "entropy": 0.5177121743409321, + "epoch": 2.005354752342704, + "grad_norm": 0.33286598324775696, + "learning_rate": 0.00020120954818464854, + "loss": 0.4759817886352539, + "mean_token_accuracy": 0.8611076672871908, + "num_tokens": 1846410.0, + "step": 750 + }, + { + "entropy": 0.43643771946430204, + "epoch": 2.139223560910308, + "grad_norm": 0.3718855679035187, + "learning_rate": 0.00019944386163239588, + "loss": 0.3936069107055664, + "mean_token_accuracy": 0.8780064672231674, + "num_tokens": 1971547.0, + "step": 800 + }, + { + "entropy": 0.44890178814530374, + "epoch": 2.2730923694779115, + "grad_norm": 0.4402499496936798, + "learning_rate": 0.0001974698134581373, + "loss": 0.40427154541015625, + "mean_token_accuracy": 0.8763552361726761, + "num_tokens": 2088560.0, + "step": 850 + }, + { + "entropy": 0.44539201706647874, + "epoch": 2.4069611780455156, + "grad_norm": 0.437489777803421, + "learning_rate": 0.00019529170189988115, + "loss": 0.4049137878417969, + "mean_token_accuracy": 0.8759620261192321, + "num_tokens": 2210146.0, + "step": 900 + }, + { + "entropy": 0.4456777948141098, + "epoch": 2.540829986613119, + "grad_norm": 0.5221232175827026, + "learning_rate": 0.0001929142695176156, + "loss": 0.4049659729003906, + "mean_token_accuracy": 0.8767672145366668, + "num_tokens": 2331354.0, + "step": 950 + }, + { + "entropy": 0.44445386946201326, + "epoch": 2.674698795180723, + "grad_norm": 0.48457658290863037, + "learning_rate": 0.00019034269286698953, + "loss": 0.4065634536743164, + "mean_token_accuracy": 0.8764267575740814, + "num_tokens": 2452558.0, + "step": 1000 + }, + { + "entropy": 0.44160154819488523, + "epoch": 2.8085676037483265, + "grad_norm": 0.36902090907096863, + "learning_rate": 0.00018758257122802307, + "loss": 0.4023736953735352, + "mean_token_accuracy": 0.8786762475967407, + "num_tokens": 2582428.0, + "step": 1050 + }, + { + "entropy": 0.4473246121406555, + "epoch": 2.9424364123159306, + "grad_norm": 0.3760707676410675, + "learning_rate": 0.00018463991441338993, + "loss": 0.40938362121582034, + "mean_token_accuracy": 0.8754651814699173, + "num_tokens": 2707532.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.4759794683754444, + "eval_loss": 0.5672900676727295, + "eval_mean_token_accuracy": 0.843816783130169, + "eval_num_tokens": 2761311.0, + "eval_runtime": 49.9422, + "eval_samples_per_second": 31.997, + "eval_steps_per_second": 4.005, + "step": 1122 + }, + { + "entropy": 0.3893387094892637, + "epoch": 3.074966532797858, + "grad_norm": 0.4708668291568756, + "learning_rate": 0.00018152112968281706, + "loss": 0.3442184829711914, + "mean_token_accuracy": 0.8925870771359916, + "num_tokens": 2834344.0, + "step": 1150 + }, + { + "entropy": 0.36019587606191633, + "epoch": 3.208835341365462, + "grad_norm": 0.49764320254325867, + "learning_rate": 0.00017823300779209423, + "loss": 0.3140977668762207, + "mean_token_accuracy": 0.8975517880916596, + "num_tokens": 2952748.0, + "step": 1200 + }, + { + "entropy": 0.3615420612692833, + "epoch": 3.3427041499330654, + "grad_norm": 0.49308347702026367, + "learning_rate": 0.0001747827082070698, + "loss": 0.31728214263916016, + "mean_token_accuracy": 0.8988397961854935, + "num_tokens": 3080333.0, + "step": 1250 + }, + { + "entropy": 0.36845425054430964, + "epoch": 3.4765729585006695, + "grad_norm": 0.47647032141685486, + "learning_rate": 0.00017117774351482735, + "loss": 0.3203315734863281, + "mean_token_accuracy": 0.8968151319026947, + "num_tokens": 3206920.0, + "step": 1300 + }, + { + "entropy": 0.37361170917749403, + "epoch": 3.610441767068273, + "grad_norm": 0.5282555818557739, + "learning_rate": 0.000167425963065986, + "loss": 0.32945499420166013, + "mean_token_accuracy": 0.8949814730882645, + "num_tokens": 3327931.0, + "step": 1350 + }, + { + "entropy": 0.3739692223072052, + "epoch": 3.7443105756358768, + "grad_norm": 0.41107845306396484, + "learning_rate": 0.00016353553588374095, + "loss": 0.32604251861572264, + "mean_token_accuracy": 0.896143769621849, + "num_tokens": 3443874.0, + "step": 1400 + }, + { + "entropy": 0.37787127375602725, + "epoch": 3.878179384203481, + "grad_norm": 0.39750751852989197, + "learning_rate": 0.00015951493287685788, + "loss": 0.3352021026611328, + "mean_token_accuracy": 0.8935402005910873, + "num_tokens": 3573158.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.4485042405128479, + "eval_loss": 0.5753094553947449, + "eval_mean_token_accuracy": 0.8450798985362052, + "eval_num_tokens": 3681748.0, + "eval_runtime": 49.9605, + "eval_samples_per_second": 31.985, + "eval_steps_per_second": 4.003, + "step": 1496 + }, + { + "entropy": 0.3735277107869736, + "epoch": 4.010709504685408, + "grad_norm": 0.41153082251548767, + "learning_rate": 0.00015537290839535005, + "loss": 0.327095947265625, + "mean_token_accuracy": 0.8959399853089843, + "num_tokens": 3691654.0, + "step": 1500 + }, + { + "entropy": 0.2689096394181252, + "epoch": 4.144578313253012, + "grad_norm": 0.5363789200782776, + "learning_rate": 0.00015111848116899814, + "loss": 0.2247480583190918, + "mean_token_accuracy": 0.9249986118078232, + "num_tokens": 3813863.0, + "step": 1550 + }, + { + "entropy": 0.27684757232666013, + "epoch": 4.278447121820616, + "grad_norm": 0.5589100122451782, + "learning_rate": 0.00014676091467021694, + "loss": 0.23430667877197264, + "mean_token_accuracy": 0.9212016260623932, + "num_tokens": 3942009.0, + "step": 1600 + }, + { + "entropy": 0.27285940989851953, + "epoch": 4.412315930388219, + "grad_norm": 0.4415719211101532, + "learning_rate": 0.00014230969694402636, + "loss": 0.23151195526123047, + "mean_token_accuracy": 0.922565575838089, + "num_tokens": 4067146.0, + "step": 1650 + }, + { + "entropy": 0.28027778953313826, + "epoch": 4.546184738955823, + "grad_norm": 0.544822096824646, + "learning_rate": 0.0001377745199490439, + "loss": 0.23426279067993164, + "mean_token_accuracy": 0.9214058065414429, + "num_tokens": 4186586.0, + "step": 1700 + }, + { + "entropy": 0.2855076715350151, + "epoch": 4.680053547523427, + "grad_norm": 0.47745293378829956, + "learning_rate": 0.00013316525845448153, + "loss": 0.2384078598022461, + "mean_token_accuracy": 0.9208149307966232, + "num_tokens": 4307001.0, + "step": 1750 + }, + { + "entropy": 0.28488670364022256, + "epoch": 4.813922356091031, + "grad_norm": 0.6087909936904907, + "learning_rate": 0.00012849194853909585, + "loss": 0.24047565460205078, + "mean_token_accuracy": 0.9198217475414276, + "num_tokens": 4429513.0, + "step": 1800 + }, + { + "entropy": 0.2799084801971912, + "epoch": 4.947791164658635, + "grad_norm": 0.4444660544395447, + "learning_rate": 0.00012376476573890707, + "loss": 0.23463037490844726, + "mean_token_accuracy": 0.9206935846805573, + "num_tokens": 4557562.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.3690616069734097, + "eval_loss": 0.6533966064453125, + "eval_mean_token_accuracy": 0.8432427588105201, + "eval_num_tokens": 4602185.0, + "eval_runtime": 49.893, + "eval_samples_per_second": 32.029, + "eval_steps_per_second": 4.009, + "step": 1870 + }, + { + "entropy": 0.2288022293436407, + "epoch": 5.080321285140562, + "grad_norm": 0.4002731442451477, + "learning_rate": 0.0001189940028912678, + "loss": 0.17887537002563478, + "mean_token_accuracy": 0.9409801201386885, + "num_tokens": 4679483.0, + "step": 1900 + }, + { + "entropy": 0.192287794649601, + "epoch": 5.214190093708166, + "grad_norm": 0.4603167474269867, + "learning_rate": 0.00011419004772352316, + "loss": 0.14474411010742189, + "mean_token_accuracy": 0.9518448287248611, + "num_tokens": 4800131.0, + "step": 1950 + }, + { + "entropy": 0.19072901770472528, + "epoch": 5.34805890227577, + "grad_norm": 0.6232134103775024, + "learning_rate": 0.00010936336023505987, + "loss": 0.14428988456726075, + "mean_token_accuracy": 0.9511868554353714, + "num_tokens": 4923382.0, + "step": 2000 + }, + { + "entropy": 0.19722454741597176, + "epoch": 5.481927710843373, + "grad_norm": 0.49368390440940857, + "learning_rate": 0.00010452444992199237, + "loss": 0.15026931762695311, + "mean_token_accuracy": 0.9493078935146332, + "num_tokens": 5042200.0, + "step": 2050 + }, + { + "entropy": 0.1921817621588707, + "epoch": 5.615796519410977, + "grad_norm": 0.6033351421356201, + "learning_rate": 9.9683852894076e-05, + "loss": 0.15000157356262206, + "mean_token_accuracy": 0.9497224026918412, + "num_tokens": 5168285.0, + "step": 2100 + }, + { + "entropy": 0.19455582827329634, + "epoch": 5.749665327978581, + "grad_norm": 0.4534570276737213, + "learning_rate": 9.485210893367247e-05, + "loss": 0.14963313102722167, + "mean_token_accuracy": 0.94916872382164, + "num_tokens": 5289880.0, + "step": 2150 + }, + { + "entropy": 0.18738240271806716, + "epoch": 5.883534136546185, + "grad_norm": 0.5815815329551697, + "learning_rate": 9.003973854671866e-05, + "loss": 0.14579124450683595, + "mean_token_accuracy": 0.9498835545778275, + "num_tokens": 5413325.0, + "step": 2200 + }, + { + "epoch": 6.0, + "eval_entropy": 0.29356860227882864, + "eval_loss": 0.7589722275733948, + "eval_mean_token_accuracy": 0.8418136316537858, + "eval_num_tokens": 5522622.0, + "eval_runtime": 49.8769, + "eval_samples_per_second": 32.039, + "eval_steps_per_second": 4.01, + "step": 2244 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.0504018349848576e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a67d7b089767a1f6708064da5a07f65728eada3e --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_bias": false, + "lora_dropout": 0.0016857635936814886, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj", + "up_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..cb2a6299585cd25931e76a4a84d0252e0b22078d --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/trainer_state.json @@ -0,0 +1,631 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.0, + "eval_steps": 500, + "global_step": 2618, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.3424121737480164, + "epoch": 0.13386880856760375, + "grad_norm": 1.1671011447906494, + "learning_rate": 2.7185774847148058e-05, + "loss": 1.2469227600097657, + "mean_token_accuracy": 0.7243366748094558, + "num_tokens": 122643.0, + "step": 50 + }, + { + "entropy": 0.6559184691309929, + "epoch": 0.2677376171352075, + "grad_norm": 0.8399614691734314, + "learning_rate": 5.4926361425870564e-05, + "loss": 0.6245294189453126, + "mean_token_accuracy": 0.8268856984376908, + "num_tokens": 245702.0, + "step": 100 + }, + { + "entropy": 0.6082554489374161, + "epoch": 0.40160642570281124, + "grad_norm": 0.5691282749176025, + "learning_rate": 8.266694800459306e-05, + "loss": 0.5804204177856446, + "mean_token_accuracy": 0.8374843555688858, + "num_tokens": 371834.0, + "step": 150 + }, + { + "entropy": 0.5805956655740738, + "epoch": 0.535475234270415, + "grad_norm": 0.45651012659072876, + "learning_rate": 0.00011040753458331558, + "loss": 0.5495451354980468, + "mean_token_accuracy": 0.8450068402290344, + "num_tokens": 500948.0, + "step": 200 + }, + { + "entropy": 0.5810796636343002, + "epoch": 0.6693440428380187, + "grad_norm": 0.493431955575943, + "learning_rate": 0.00013814812116203808, + "loss": 0.5546633911132812, + "mean_token_accuracy": 0.8431127589941024, + "num_tokens": 621874.0, + "step": 250 + }, + { + "entropy": 0.5683389616012573, + "epoch": 0.8032128514056225, + "grad_norm": 0.4698493182659149, + "learning_rate": 0.00016588870774076058, + "loss": 0.5354468536376953, + "mean_token_accuracy": 0.8477097982168198, + "num_tokens": 746138.0, + "step": 300 + }, + { + "entropy": 0.5579970148205757, + "epoch": 0.9370816599732262, + "grad_norm": 0.5637578964233398, + "learning_rate": 0.0001936292943194831, + "loss": 0.5262400054931641, + "mean_token_accuracy": 0.8487933957576752, + "num_tokens": 868331.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5953671643137932, + "eval_loss": 0.5836789608001709, + "eval_mean_token_accuracy": 0.8373425653576851, + "eval_num_tokens": 920437.0, + "eval_runtime": 49.8765, + "eval_samples_per_second": 32.039, + "eval_steps_per_second": 4.01, + "step": 374 + }, + { + "entropy": 0.5491663054986433, + "epoch": 1.069611780455154, + "grad_norm": 0.6051601767539978, + "learning_rate": 0.0002074713460228683, + "loss": 0.5132473373413086, + "mean_token_accuracy": 0.8526828770685677, + "num_tokens": 980838.0, + "step": 400 + }, + { + "entropy": 0.5334427100419998, + "epoch": 1.2034805890227578, + "grad_norm": 0.4304046630859375, + "learning_rate": 0.00020724550557791978, + "loss": 0.497388916015625, + "mean_token_accuracy": 0.8541187030076981, + "num_tokens": 1104763.0, + "step": 450 + }, + { + "entropy": 0.5297759872674942, + "epoch": 1.3373493975903614, + "grad_norm": 0.4243815541267395, + "learning_rate": 0.00020679431642677408, + "loss": 0.49724563598632815, + "mean_token_accuracy": 0.8557562667131424, + "num_tokens": 1230459.0, + "step": 500 + }, + { + "entropy": 0.5287212440371514, + "epoch": 1.4712182061579653, + "grad_norm": 0.5970085859298706, + "learning_rate": 0.0002061187609762355, + "loss": 0.4903334808349609, + "mean_token_accuracy": 0.8566980129480362, + "num_tokens": 1356368.0, + "step": 550 + }, + { + "entropy": 0.5196757692098618, + "epoch": 1.605087014725569, + "grad_norm": 0.353085458278656, + "learning_rate": 0.00020522031016209576, + "loss": 0.48056564331054685, + "mean_token_accuracy": 0.8591135066747665, + "num_tokens": 1484569.0, + "step": 600 + }, + { + "entropy": 0.5124905353784561, + "epoch": 1.7389558232931726, + "grad_norm": 0.383682519197464, + "learning_rate": 0.00020410092024635923, + "loss": 0.47599597930908205, + "mean_token_accuracy": 0.8606968414783478, + "num_tokens": 1609962.0, + "step": 650 + }, + { + "entropy": 0.5109778612852096, + "epoch": 1.8728246318607764, + "grad_norm": 0.35959598422050476, + "learning_rate": 0.00020276302855773176, + "loss": 0.47350929260253904, + "mean_token_accuracy": 0.862987876534462, + "num_tokens": 1729667.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.5806624293327332, + "eval_loss": 0.5672553181648254, + "eval_mean_token_accuracy": 0.8352122050523758, + "eval_num_tokens": 1840874.0, + "eval_runtime": 49.8669, + "eval_samples_per_second": 32.045, + "eval_steps_per_second": 4.011, + "step": 748 + }, + { + "entropy": 0.5177121743409321, + "epoch": 2.005354752342704, + "grad_norm": 0.33286598324775696, + "learning_rate": 0.00020120954818464854, + "loss": 0.4759817886352539, + "mean_token_accuracy": 0.8611076672871908, + "num_tokens": 1846410.0, + "step": 750 + }, + { + "entropy": 0.43643771946430204, + "epoch": 2.139223560910308, + "grad_norm": 0.3718855679035187, + "learning_rate": 0.00019944386163239588, + "loss": 0.3936069107055664, + "mean_token_accuracy": 0.8780064672231674, + "num_tokens": 1971547.0, + "step": 800 + }, + { + "entropy": 0.44890178814530374, + "epoch": 2.2730923694779115, + "grad_norm": 0.4402499496936798, + "learning_rate": 0.0001974698134581373, + "loss": 0.40427154541015625, + "mean_token_accuracy": 0.8763552361726761, + "num_tokens": 2088560.0, + "step": 850 + }, + { + "entropy": 0.44539201706647874, + "epoch": 2.4069611780455156, + "grad_norm": 0.437489777803421, + "learning_rate": 0.00019529170189988115, + "loss": 0.4049137878417969, + "mean_token_accuracy": 0.8759620261192321, + "num_tokens": 2210146.0, + "step": 900 + }, + { + "entropy": 0.4456777948141098, + "epoch": 2.540829986613119, + "grad_norm": 0.5221232175827026, + "learning_rate": 0.0001929142695176156, + "loss": 0.4049659729003906, + "mean_token_accuracy": 0.8767672145366668, + "num_tokens": 2331354.0, + "step": 950 + }, + { + "entropy": 0.44445386946201326, + "epoch": 2.674698795180723, + "grad_norm": 0.48457658290863037, + "learning_rate": 0.00019034269286698953, + "loss": 0.4065634536743164, + "mean_token_accuracy": 0.8764267575740814, + "num_tokens": 2452558.0, + "step": 1000 + }, + { + "entropy": 0.44160154819488523, + "epoch": 2.8085676037483265, + "grad_norm": 0.36902090907096863, + "learning_rate": 0.00018758257122802307, + "loss": 0.4023736953735352, + "mean_token_accuracy": 0.8786762475967407, + "num_tokens": 2582428.0, + "step": 1050 + }, + { + "entropy": 0.4473246121406555, + "epoch": 2.9424364123159306, + "grad_norm": 0.3760707676410675, + "learning_rate": 0.00018463991441338993, + "loss": 0.40938362121582034, + "mean_token_accuracy": 0.8754651814699173, + "num_tokens": 2707532.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.4759794683754444, + "eval_loss": 0.5672900676727295, + "eval_mean_token_accuracy": 0.843816783130169, + "eval_num_tokens": 2761311.0, + "eval_runtime": 49.9422, + "eval_samples_per_second": 31.997, + "eval_steps_per_second": 4.005, + "step": 1122 + }, + { + "entropy": 0.3893387094892637, + "epoch": 3.074966532797858, + "grad_norm": 0.4708668291568756, + "learning_rate": 0.00018152112968281706, + "loss": 0.3442184829711914, + "mean_token_accuracy": 0.8925870771359916, + "num_tokens": 2834344.0, + "step": 1150 + }, + { + "entropy": 0.36019587606191633, + "epoch": 3.208835341365462, + "grad_norm": 0.49764320254325867, + "learning_rate": 0.00017823300779209423, + "loss": 0.3140977668762207, + "mean_token_accuracy": 0.8975517880916596, + "num_tokens": 2952748.0, + "step": 1200 + }, + { + "entropy": 0.3615420612692833, + "epoch": 3.3427041499330654, + "grad_norm": 0.49308347702026367, + "learning_rate": 0.0001747827082070698, + "loss": 0.31728214263916016, + "mean_token_accuracy": 0.8988397961854935, + "num_tokens": 3080333.0, + "step": 1250 + }, + { + "entropy": 0.36845425054430964, + "epoch": 3.4765729585006695, + "grad_norm": 0.47647032141685486, + "learning_rate": 0.00017117774351482735, + "loss": 0.3203315734863281, + "mean_token_accuracy": 0.8968151319026947, + "num_tokens": 3206920.0, + "step": 1300 + }, + { + "entropy": 0.37361170917749403, + "epoch": 3.610441767068273, + "grad_norm": 0.5282555818557739, + "learning_rate": 0.000167425963065986, + "loss": 0.32945499420166013, + "mean_token_accuracy": 0.8949814730882645, + "num_tokens": 3327931.0, + "step": 1350 + }, + { + "entropy": 0.3739692223072052, + "epoch": 3.7443105756358768, + "grad_norm": 0.41107845306396484, + "learning_rate": 0.00016353553588374095, + "loss": 0.32604251861572264, + "mean_token_accuracy": 0.896143769621849, + "num_tokens": 3443874.0, + "step": 1400 + }, + { + "entropy": 0.37787127375602725, + "epoch": 3.878179384203481, + "grad_norm": 0.39750751852989197, + "learning_rate": 0.00015951493287685788, + "loss": 0.3352021026611328, + "mean_token_accuracy": 0.8935402005910873, + "num_tokens": 3573158.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.4485042405128479, + "eval_loss": 0.5753094553947449, + "eval_mean_token_accuracy": 0.8450798985362052, + "eval_num_tokens": 3681748.0, + "eval_runtime": 49.9605, + "eval_samples_per_second": 31.985, + "eval_steps_per_second": 4.003, + "step": 1496 + }, + { + "entropy": 0.3735277107869736, + "epoch": 4.010709504685408, + "grad_norm": 0.41153082251548767, + "learning_rate": 0.00015537290839535005, + "loss": 0.327095947265625, + "mean_token_accuracy": 0.8959399853089843, + "num_tokens": 3691654.0, + "step": 1500 + }, + { + "entropy": 0.2689096394181252, + "epoch": 4.144578313253012, + "grad_norm": 0.5363789200782776, + "learning_rate": 0.00015111848116899814, + "loss": 0.2247480583190918, + "mean_token_accuracy": 0.9249986118078232, + "num_tokens": 3813863.0, + "step": 1550 + }, + { + "entropy": 0.27684757232666013, + "epoch": 4.278447121820616, + "grad_norm": 0.5589100122451782, + "learning_rate": 0.00014676091467021694, + "loss": 0.23430667877197264, + "mean_token_accuracy": 0.9212016260623932, + "num_tokens": 3942009.0, + "step": 1600 + }, + { + "entropy": 0.27285940989851953, + "epoch": 4.412315930388219, + "grad_norm": 0.4415719211101532, + "learning_rate": 0.00014230969694402636, + "loss": 0.23151195526123047, + "mean_token_accuracy": 0.922565575838089, + "num_tokens": 4067146.0, + "step": 1650 + }, + { + "entropy": 0.28027778953313826, + "epoch": 4.546184738955823, + "grad_norm": 0.544822096824646, + "learning_rate": 0.0001377745199490439, + "loss": 0.23426279067993164, + "mean_token_accuracy": 0.9214058065414429, + "num_tokens": 4186586.0, + "step": 1700 + }, + { + "entropy": 0.2855076715350151, + "epoch": 4.680053547523427, + "grad_norm": 0.47745293378829956, + "learning_rate": 0.00013316525845448153, + "loss": 0.2384078598022461, + "mean_token_accuracy": 0.9208149307966232, + "num_tokens": 4307001.0, + "step": 1750 + }, + { + "entropy": 0.28488670364022256, + "epoch": 4.813922356091031, + "grad_norm": 0.6087909936904907, + "learning_rate": 0.00012849194853909585, + "loss": 0.24047565460205078, + "mean_token_accuracy": 0.9198217475414276, + "num_tokens": 4429513.0, + "step": 1800 + }, + { + "entropy": 0.2799084801971912, + "epoch": 4.947791164658635, + "grad_norm": 0.4444660544395447, + "learning_rate": 0.00012376476573890707, + "loss": 0.23463037490844726, + "mean_token_accuracy": 0.9206935846805573, + "num_tokens": 4557562.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.3690616069734097, + "eval_loss": 0.6533966064453125, + "eval_mean_token_accuracy": 0.8432427588105201, + "eval_num_tokens": 4602185.0, + "eval_runtime": 49.893, + "eval_samples_per_second": 32.029, + "eval_steps_per_second": 4.009, + "step": 1870 + }, + { + "entropy": 0.2288022293436407, + "epoch": 5.080321285140562, + "grad_norm": 0.4002731442451477, + "learning_rate": 0.0001189940028912678, + "loss": 0.17887537002563478, + "mean_token_accuracy": 0.9409801201386885, + "num_tokens": 4679483.0, + "step": 1900 + }, + { + "entropy": 0.192287794649601, + "epoch": 5.214190093708166, + "grad_norm": 0.4603167474269867, + "learning_rate": 0.00011419004772352316, + "loss": 0.14474411010742189, + "mean_token_accuracy": 0.9518448287248611, + "num_tokens": 4800131.0, + "step": 1950 + }, + { + "entropy": 0.19072901770472528, + "epoch": 5.34805890227577, + "grad_norm": 0.6232134103775024, + "learning_rate": 0.00010936336023505987, + "loss": 0.14428988456726075, + "mean_token_accuracy": 0.9511868554353714, + "num_tokens": 4923382.0, + "step": 2000 + }, + { + "entropy": 0.19722454741597176, + "epoch": 5.481927710843373, + "grad_norm": 0.49368390440940857, + "learning_rate": 0.00010452444992199237, + "loss": 0.15026931762695311, + "mean_token_accuracy": 0.9493078935146332, + "num_tokens": 5042200.0, + "step": 2050 + }, + { + "entropy": 0.1921817621588707, + "epoch": 5.615796519410977, + "grad_norm": 0.6033351421356201, + "learning_rate": 9.9683852894076e-05, + "loss": 0.15000157356262206, + "mean_token_accuracy": 0.9497224026918412, + "num_tokens": 5168285.0, + "step": 2100 + }, + { + "entropy": 0.19455582827329634, + "epoch": 5.749665327978581, + "grad_norm": 0.4534570276737213, + "learning_rate": 9.485210893367247e-05, + "loss": 0.14963313102722167, + "mean_token_accuracy": 0.94916872382164, + "num_tokens": 5289880.0, + "step": 2150 + }, + { + "entropy": 0.18738240271806716, + "epoch": 5.883534136546185, + "grad_norm": 0.5815815329551697, + "learning_rate": 9.003973854671866e-05, + "loss": 0.14579124450683595, + "mean_token_accuracy": 0.9498835545778275, + "num_tokens": 5413325.0, + "step": 2200 + }, + { + "epoch": 6.0, + "eval_entropy": 0.29356860227882864, + "eval_loss": 0.7589722275733948, + "eval_mean_token_accuracy": 0.8418136316537858, + "eval_num_tokens": 5522622.0, + "eval_runtime": 49.8769, + "eval_samples_per_second": 32.039, + "eval_steps_per_second": 4.01, + "step": 2244 + }, + { + "entropy": 0.19056345296628546, + "epoch": 6.016064257028113, + "grad_norm": 0.29291921854019165, + "learning_rate": 8.525722005566732e-05, + "loss": 0.14140020370483397, + "mean_token_accuracy": 0.9524310213146787, + "num_tokens": 5536511.0, + "step": 2250 + }, + { + "entropy": 0.13221844218671322, + "epoch": 6.149933065595716, + "grad_norm": 0.34443414211273193, + "learning_rate": 8.051496678427703e-05, + "loss": 0.0891877555847168, + "mean_token_accuracy": 0.97141546189785, + "num_tokens": 5663054.0, + "step": 2300 + }, + { + "entropy": 0.13209220491349696, + "epoch": 6.28380187416332, + "grad_norm": 0.4142569899559021, + "learning_rate": 7.58233043839285e-05, + "loss": 0.08825708389282226, + "mean_token_accuracy": 0.9711007869243622, + "num_tokens": 5790379.0, + "step": 2350 + }, + { + "entropy": 0.14063000075519086, + "epoch": 6.417670682730924, + "grad_norm": 0.47584882378578186, + "learning_rate": 7.119244835083612e-05, + "loss": 0.09473857879638672, + "mean_token_accuracy": 0.9696242707967758, + "num_tokens": 5908852.0, + "step": 2400 + }, + { + "entropy": 0.13315705463290214, + "epoch": 6.551539491298527, + "grad_norm": 0.3179854452610016, + "learning_rate": 6.66324817831086e-05, + "loss": 0.0911135196685791, + "mean_token_accuracy": 0.9703826290369034, + "num_tokens": 6033966.0, + "step": 2450 + }, + { + "entropy": 0.13554719373583793, + "epoch": 6.685408299866131, + "grad_norm": 0.49769842624664307, + "learning_rate": 6.215333342608944e-05, + "loss": 0.09153086662292481, + "mean_token_accuracy": 0.9705063331127167, + "num_tokens": 6156278.0, + "step": 2500 + }, + { + "entropy": 0.13915603026747703, + "epoch": 6.8192771084337345, + "grad_norm": 0.467375785112381, + "learning_rate": 5.7764756053780784e-05, + "loss": 0.09427680969238281, + "mean_token_accuracy": 0.9695158433914185, + "num_tokens": 6276774.0, + "step": 2550 + }, + { + "entropy": 0.13692217327654363, + "epoch": 6.953145917001339, + "grad_norm": 0.36858609318733215, + "learning_rate": 5.3476305233422516e-05, + "loss": 0.09176054954528809, + "mean_token_accuracy": 0.969379341006279, + "num_tokens": 6401444.0, + "step": 2600 + }, + { + "epoch": 7.0, + "eval_entropy": 0.24440797246992588, + "eval_loss": 0.891926646232605, + "eval_mean_token_accuracy": 0.8400790172815323, + "eval_num_tokens": 6443059.0, + "eval_runtime": 49.9579, + "eval_samples_per_second": 31.987, + "eval_steps_per_second": 4.003, + "step": 2618 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.561128787588956e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a67d7b089767a1f6708064da5a07f65728eada3e --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_bias": false, + "lora_dropout": 0.0016857635936814886, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj", + "up_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..538149a07655e785730cc0a4f2d3cd515f54dc34 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/trainer_state.json @@ -0,0 +1,712 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.0, + "eval_steps": 500, + "global_step": 2992, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.3424121737480164, + "epoch": 0.13386880856760375, + "grad_norm": 1.1671011447906494, + "learning_rate": 2.7185774847148058e-05, + "loss": 1.2469227600097657, + "mean_token_accuracy": 0.7243366748094558, + "num_tokens": 122643.0, + "step": 50 + }, + { + "entropy": 0.6559184691309929, + "epoch": 0.2677376171352075, + "grad_norm": 0.8399614691734314, + "learning_rate": 5.4926361425870564e-05, + "loss": 0.6245294189453126, + "mean_token_accuracy": 0.8268856984376908, + "num_tokens": 245702.0, + "step": 100 + }, + { + "entropy": 0.6082554489374161, + "epoch": 0.40160642570281124, + "grad_norm": 0.5691282749176025, + "learning_rate": 8.266694800459306e-05, + "loss": 0.5804204177856446, + "mean_token_accuracy": 0.8374843555688858, + "num_tokens": 371834.0, + "step": 150 + }, + { + "entropy": 0.5805956655740738, + "epoch": 0.535475234270415, + "grad_norm": 0.45651012659072876, + "learning_rate": 0.00011040753458331558, + "loss": 0.5495451354980468, + "mean_token_accuracy": 0.8450068402290344, + "num_tokens": 500948.0, + "step": 200 + }, + { + "entropy": 0.5810796636343002, + "epoch": 0.6693440428380187, + "grad_norm": 0.493431955575943, + "learning_rate": 0.00013814812116203808, + "loss": 0.5546633911132812, + "mean_token_accuracy": 0.8431127589941024, + "num_tokens": 621874.0, + "step": 250 + }, + { + "entropy": 0.5683389616012573, + "epoch": 0.8032128514056225, + "grad_norm": 0.4698493182659149, + "learning_rate": 0.00016588870774076058, + "loss": 0.5354468536376953, + "mean_token_accuracy": 0.8477097982168198, + "num_tokens": 746138.0, + "step": 300 + }, + { + "entropy": 0.5579970148205757, + "epoch": 0.9370816599732262, + "grad_norm": 0.5637578964233398, + "learning_rate": 0.0001936292943194831, + "loss": 0.5262400054931641, + "mean_token_accuracy": 0.8487933957576752, + "num_tokens": 868331.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5953671643137932, + "eval_loss": 0.5836789608001709, + "eval_mean_token_accuracy": 0.8373425653576851, + "eval_num_tokens": 920437.0, + "eval_runtime": 49.8765, + "eval_samples_per_second": 32.039, + "eval_steps_per_second": 4.01, + "step": 374 + }, + { + "entropy": 0.5491663054986433, + "epoch": 1.069611780455154, + "grad_norm": 0.6051601767539978, + "learning_rate": 0.0002074713460228683, + "loss": 0.5132473373413086, + "mean_token_accuracy": 0.8526828770685677, + "num_tokens": 980838.0, + "step": 400 + }, + { + "entropy": 0.5334427100419998, + "epoch": 1.2034805890227578, + "grad_norm": 0.4304046630859375, + "learning_rate": 0.00020724550557791978, + "loss": 0.497388916015625, + "mean_token_accuracy": 0.8541187030076981, + "num_tokens": 1104763.0, + "step": 450 + }, + { + "entropy": 0.5297759872674942, + "epoch": 1.3373493975903614, + "grad_norm": 0.4243815541267395, + "learning_rate": 0.00020679431642677408, + "loss": 0.49724563598632815, + "mean_token_accuracy": 0.8557562667131424, + "num_tokens": 1230459.0, + "step": 500 + }, + { + "entropy": 0.5287212440371514, + "epoch": 1.4712182061579653, + "grad_norm": 0.5970085859298706, + "learning_rate": 0.0002061187609762355, + "loss": 0.4903334808349609, + "mean_token_accuracy": 0.8566980129480362, + "num_tokens": 1356368.0, + "step": 550 + }, + { + "entropy": 0.5196757692098618, + "epoch": 1.605087014725569, + "grad_norm": 0.353085458278656, + "learning_rate": 0.00020522031016209576, + "loss": 0.48056564331054685, + "mean_token_accuracy": 0.8591135066747665, + "num_tokens": 1484569.0, + "step": 600 + }, + { + "entropy": 0.5124905353784561, + "epoch": 1.7389558232931726, + "grad_norm": 0.383682519197464, + "learning_rate": 0.00020410092024635923, + "loss": 0.47599597930908205, + "mean_token_accuracy": 0.8606968414783478, + "num_tokens": 1609962.0, + "step": 650 + }, + { + "entropy": 0.5109778612852096, + "epoch": 1.8728246318607764, + "grad_norm": 0.35959598422050476, + "learning_rate": 0.00020276302855773176, + "loss": 0.47350929260253904, + "mean_token_accuracy": 0.862987876534462, + "num_tokens": 1729667.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.5806624293327332, + "eval_loss": 0.5672553181648254, + "eval_mean_token_accuracy": 0.8352122050523758, + "eval_num_tokens": 1840874.0, + "eval_runtime": 49.8669, + "eval_samples_per_second": 32.045, + "eval_steps_per_second": 4.011, + "step": 748 + }, + { + "entropy": 0.5177121743409321, + "epoch": 2.005354752342704, + "grad_norm": 0.33286598324775696, + "learning_rate": 0.00020120954818464854, + "loss": 0.4759817886352539, + "mean_token_accuracy": 0.8611076672871908, + "num_tokens": 1846410.0, + "step": 750 + }, + { + "entropy": 0.43643771946430204, + "epoch": 2.139223560910308, + "grad_norm": 0.3718855679035187, + "learning_rate": 0.00019944386163239588, + "loss": 0.3936069107055664, + "mean_token_accuracy": 0.8780064672231674, + "num_tokens": 1971547.0, + "step": 800 + }, + { + "entropy": 0.44890178814530374, + "epoch": 2.2730923694779115, + "grad_norm": 0.4402499496936798, + "learning_rate": 0.0001974698134581373, + "loss": 0.40427154541015625, + "mean_token_accuracy": 0.8763552361726761, + "num_tokens": 2088560.0, + "step": 850 + }, + { + "entropy": 0.44539201706647874, + "epoch": 2.4069611780455156, + "grad_norm": 0.437489777803421, + "learning_rate": 0.00019529170189988115, + "loss": 0.4049137878417969, + "mean_token_accuracy": 0.8759620261192321, + "num_tokens": 2210146.0, + "step": 900 + }, + { + "entropy": 0.4456777948141098, + "epoch": 2.540829986613119, + "grad_norm": 0.5221232175827026, + "learning_rate": 0.0001929142695176156, + "loss": 0.4049659729003906, + "mean_token_accuracy": 0.8767672145366668, + "num_tokens": 2331354.0, + "step": 950 + }, + { + "entropy": 0.44445386946201326, + "epoch": 2.674698795180723, + "grad_norm": 0.48457658290863037, + "learning_rate": 0.00019034269286698953, + "loss": 0.4065634536743164, + "mean_token_accuracy": 0.8764267575740814, + "num_tokens": 2452558.0, + "step": 1000 + }, + { + "entropy": 0.44160154819488523, + "epoch": 2.8085676037483265, + "grad_norm": 0.36902090907096863, + "learning_rate": 0.00018758257122802307, + "loss": 0.4023736953735352, + "mean_token_accuracy": 0.8786762475967407, + "num_tokens": 2582428.0, + "step": 1050 + }, + { + "entropy": 0.4473246121406555, + "epoch": 2.9424364123159306, + "grad_norm": 0.3760707676410675, + "learning_rate": 0.00018463991441338993, + "loss": 0.40938362121582034, + "mean_token_accuracy": 0.8754651814699173, + "num_tokens": 2707532.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.4759794683754444, + "eval_loss": 0.5672900676727295, + "eval_mean_token_accuracy": 0.843816783130169, + "eval_num_tokens": 2761311.0, + "eval_runtime": 49.9422, + "eval_samples_per_second": 31.997, + "eval_steps_per_second": 4.005, + "step": 1122 + }, + { + "entropy": 0.3893387094892637, + "epoch": 3.074966532797858, + "grad_norm": 0.4708668291568756, + "learning_rate": 0.00018152112968281706, + "loss": 0.3442184829711914, + "mean_token_accuracy": 0.8925870771359916, + "num_tokens": 2834344.0, + "step": 1150 + }, + { + "entropy": 0.36019587606191633, + "epoch": 3.208835341365462, + "grad_norm": 0.49764320254325867, + "learning_rate": 0.00017823300779209423, + "loss": 0.3140977668762207, + "mean_token_accuracy": 0.8975517880916596, + "num_tokens": 2952748.0, + "step": 1200 + }, + { + "entropy": 0.3615420612692833, + "epoch": 3.3427041499330654, + "grad_norm": 0.49308347702026367, + "learning_rate": 0.0001747827082070698, + "loss": 0.31728214263916016, + "mean_token_accuracy": 0.8988397961854935, + "num_tokens": 3080333.0, + "step": 1250 + }, + { + "entropy": 0.36845425054430964, + "epoch": 3.4765729585006695, + "grad_norm": 0.47647032141685486, + "learning_rate": 0.00017117774351482735, + "loss": 0.3203315734863281, + "mean_token_accuracy": 0.8968151319026947, + "num_tokens": 3206920.0, + "step": 1300 + }, + { + "entropy": 0.37361170917749403, + "epoch": 3.610441767068273, + "grad_norm": 0.5282555818557739, + "learning_rate": 0.000167425963065986, + "loss": 0.32945499420166013, + "mean_token_accuracy": 0.8949814730882645, + "num_tokens": 3327931.0, + "step": 1350 + }, + { + "entropy": 0.3739692223072052, + "epoch": 3.7443105756358768, + "grad_norm": 0.41107845306396484, + "learning_rate": 0.00016353553588374095, + "loss": 0.32604251861572264, + "mean_token_accuracy": 0.896143769621849, + "num_tokens": 3443874.0, + "step": 1400 + }, + { + "entropy": 0.37787127375602725, + "epoch": 3.878179384203481, + "grad_norm": 0.39750751852989197, + "learning_rate": 0.00015951493287685788, + "loss": 0.3352021026611328, + "mean_token_accuracy": 0.8935402005910873, + "num_tokens": 3573158.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.4485042405128479, + "eval_loss": 0.5753094553947449, + "eval_mean_token_accuracy": 0.8450798985362052, + "eval_num_tokens": 3681748.0, + "eval_runtime": 49.9605, + "eval_samples_per_second": 31.985, + "eval_steps_per_second": 4.003, + "step": 1496 + }, + { + "entropy": 0.3735277107869736, + "epoch": 4.010709504685408, + "grad_norm": 0.41153082251548767, + "learning_rate": 0.00015537290839535005, + "loss": 0.327095947265625, + "mean_token_accuracy": 0.8959399853089843, + "num_tokens": 3691654.0, + "step": 1500 + }, + { + "entropy": 0.2689096394181252, + "epoch": 4.144578313253012, + "grad_norm": 0.5363789200782776, + "learning_rate": 0.00015111848116899814, + "loss": 0.2247480583190918, + "mean_token_accuracy": 0.9249986118078232, + "num_tokens": 3813863.0, + "step": 1550 + }, + { + "entropy": 0.27684757232666013, + "epoch": 4.278447121820616, + "grad_norm": 0.5589100122451782, + "learning_rate": 0.00014676091467021694, + "loss": 0.23430667877197264, + "mean_token_accuracy": 0.9212016260623932, + "num_tokens": 3942009.0, + "step": 1600 + }, + { + "entropy": 0.27285940989851953, + "epoch": 4.412315930388219, + "grad_norm": 0.4415719211101532, + "learning_rate": 0.00014230969694402636, + "loss": 0.23151195526123047, + "mean_token_accuracy": 0.922565575838089, + "num_tokens": 4067146.0, + "step": 1650 + }, + { + "entropy": 0.28027778953313826, + "epoch": 4.546184738955823, + "grad_norm": 0.544822096824646, + "learning_rate": 0.0001377745199490439, + "loss": 0.23426279067993164, + "mean_token_accuracy": 0.9214058065414429, + "num_tokens": 4186586.0, + "step": 1700 + }, + { + "entropy": 0.2855076715350151, + "epoch": 4.680053547523427, + "grad_norm": 0.47745293378829956, + "learning_rate": 0.00013316525845448153, + "loss": 0.2384078598022461, + "mean_token_accuracy": 0.9208149307966232, + "num_tokens": 4307001.0, + "step": 1750 + }, + { + "entropy": 0.28488670364022256, + "epoch": 4.813922356091031, + "grad_norm": 0.6087909936904907, + "learning_rate": 0.00012849194853909585, + "loss": 0.24047565460205078, + "mean_token_accuracy": 0.9198217475414276, + "num_tokens": 4429513.0, + "step": 1800 + }, + { + "entropy": 0.2799084801971912, + "epoch": 4.947791164658635, + "grad_norm": 0.4444660544395447, + "learning_rate": 0.00012376476573890707, + "loss": 0.23463037490844726, + "mean_token_accuracy": 0.9206935846805573, + "num_tokens": 4557562.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.3690616069734097, + "eval_loss": 0.6533966064453125, + "eval_mean_token_accuracy": 0.8432427588105201, + "eval_num_tokens": 4602185.0, + "eval_runtime": 49.893, + "eval_samples_per_second": 32.029, + "eval_steps_per_second": 4.009, + "step": 1870 + }, + { + "entropy": 0.2288022293436407, + "epoch": 5.080321285140562, + "grad_norm": 0.4002731442451477, + "learning_rate": 0.0001189940028912678, + "loss": 0.17887537002563478, + "mean_token_accuracy": 0.9409801201386885, + "num_tokens": 4679483.0, + "step": 1900 + }, + { + "entropy": 0.192287794649601, + "epoch": 5.214190093708166, + "grad_norm": 0.4603167474269867, + "learning_rate": 0.00011419004772352316, + "loss": 0.14474411010742189, + "mean_token_accuracy": 0.9518448287248611, + "num_tokens": 4800131.0, + "step": 1950 + }, + { + "entropy": 0.19072901770472528, + "epoch": 5.34805890227577, + "grad_norm": 0.6232134103775024, + "learning_rate": 0.00010936336023505987, + "loss": 0.14428988456726075, + "mean_token_accuracy": 0.9511868554353714, + "num_tokens": 4923382.0, + "step": 2000 + }, + { + "entropy": 0.19722454741597176, + "epoch": 5.481927710843373, + "grad_norm": 0.49368390440940857, + "learning_rate": 0.00010452444992199237, + "loss": 0.15026931762695311, + "mean_token_accuracy": 0.9493078935146332, + "num_tokens": 5042200.0, + "step": 2050 + }, + { + "entropy": 0.1921817621588707, + "epoch": 5.615796519410977, + "grad_norm": 0.6033351421356201, + "learning_rate": 9.9683852894076e-05, + "loss": 0.15000157356262206, + "mean_token_accuracy": 0.9497224026918412, + "num_tokens": 5168285.0, + "step": 2100 + }, + { + "entropy": 0.19455582827329634, + "epoch": 5.749665327978581, + "grad_norm": 0.4534570276737213, + "learning_rate": 9.485210893367247e-05, + "loss": 0.14963313102722167, + "mean_token_accuracy": 0.94916872382164, + "num_tokens": 5289880.0, + "step": 2150 + }, + { + "entropy": 0.18738240271806716, + "epoch": 5.883534136546185, + "grad_norm": 0.5815815329551697, + "learning_rate": 9.003973854671866e-05, + "loss": 0.14579124450683595, + "mean_token_accuracy": 0.9498835545778275, + "num_tokens": 5413325.0, + "step": 2200 + }, + { + "epoch": 6.0, + "eval_entropy": 0.29356860227882864, + "eval_loss": 0.7589722275733948, + "eval_mean_token_accuracy": 0.8418136316537858, + "eval_num_tokens": 5522622.0, + "eval_runtime": 49.8769, + "eval_samples_per_second": 32.039, + "eval_steps_per_second": 4.01, + "step": 2244 + }, + { + "entropy": 0.19056345296628546, + "epoch": 6.016064257028113, + "grad_norm": 0.29291921854019165, + "learning_rate": 8.525722005566732e-05, + "loss": 0.14140020370483397, + "mean_token_accuracy": 0.9524310213146787, + "num_tokens": 5536511.0, + "step": 2250 + }, + { + "entropy": 0.13221844218671322, + "epoch": 6.149933065595716, + "grad_norm": 0.34443414211273193, + "learning_rate": 8.051496678427703e-05, + "loss": 0.0891877555847168, + "mean_token_accuracy": 0.97141546189785, + "num_tokens": 5663054.0, + "step": 2300 + }, + { + "entropy": 0.13209220491349696, + "epoch": 6.28380187416332, + "grad_norm": 0.4142569899559021, + "learning_rate": 7.58233043839285e-05, + "loss": 0.08825708389282226, + "mean_token_accuracy": 0.9711007869243622, + "num_tokens": 5790379.0, + "step": 2350 + }, + { + "entropy": 0.14063000075519086, + "epoch": 6.417670682730924, + "grad_norm": 0.47584882378578186, + "learning_rate": 7.119244835083612e-05, + "loss": 0.09473857879638672, + "mean_token_accuracy": 0.9696242707967758, + "num_tokens": 5908852.0, + "step": 2400 + }, + { + "entropy": 0.13315705463290214, + "epoch": 6.551539491298527, + "grad_norm": 0.3179854452610016, + "learning_rate": 6.66324817831086e-05, + "loss": 0.0911135196685791, + "mean_token_accuracy": 0.9703826290369034, + "num_tokens": 6033966.0, + "step": 2450 + }, + { + "entropy": 0.13554719373583793, + "epoch": 6.685408299866131, + "grad_norm": 0.49769842624664307, + "learning_rate": 6.215333342608944e-05, + "loss": 0.09153086662292481, + "mean_token_accuracy": 0.9705063331127167, + "num_tokens": 6156278.0, + "step": 2500 + }, + { + "entropy": 0.13915603026747703, + "epoch": 6.8192771084337345, + "grad_norm": 0.467375785112381, + "learning_rate": 5.7764756053780784e-05, + "loss": 0.09427680969238281, + "mean_token_accuracy": 0.9695158433914185, + "num_tokens": 6276774.0, + "step": 2550 + }, + { + "entropy": 0.13692217327654363, + "epoch": 6.953145917001339, + "grad_norm": 0.36858609318733215, + "learning_rate": 5.3476305233422516e-05, + "loss": 0.09176054954528809, + "mean_token_accuracy": 0.969379341006279, + "num_tokens": 6401444.0, + "step": 2600 + }, + { + "epoch": 7.0, + "eval_entropy": 0.24440797246992588, + "eval_loss": 0.891926646232605, + "eval_mean_token_accuracy": 0.8400790172815323, + "eval_num_tokens": 6443059.0, + "eval_runtime": 49.9579, + "eval_samples_per_second": 31.987, + "eval_steps_per_second": 4.003, + "step": 2618 + }, + { + "entropy": 0.1212329932234504, + "epoch": 7.085676037483267, + "grad_norm": 0.2522813677787781, + "learning_rate": 4.929731851946405e-05, + "loss": 0.07568974018096924, + "mean_token_accuracy": 0.975432159924748, + "num_tokens": 6525142.0, + "step": 2650 + }, + { + "entropy": 0.10857273273169994, + "epoch": 7.21954484605087, + "grad_norm": 0.2335294634103775, + "learning_rate": 4.5236895122230764e-05, + "loss": 0.06618132591247558, + "mean_token_accuracy": 0.9785620093345642, + "num_tokens": 6651930.0, + "step": 2700 + }, + { + "entropy": 0.1136517857015133, + "epoch": 7.353413654618474, + "grad_norm": 0.3027023375034332, + "learning_rate": 4.130387609555471e-05, + "loss": 0.06987609386444092, + "mean_token_accuracy": 0.9772803634405136, + "num_tokens": 6768469.0, + "step": 2750 + }, + { + "entropy": 0.10837352603673935, + "epoch": 7.4872824631860775, + "grad_norm": 0.3191539943218231, + "learning_rate": 3.750682508650807e-05, + "loss": 0.06725080013275146, + "mean_token_accuracy": 0.9786273115873336, + "num_tokens": 6892532.0, + "step": 2800 + }, + { + "entropy": 0.10379995822906495, + "epoch": 7.621151271753681, + "grad_norm": 0.21721133589744568, + "learning_rate": 3.3854009689154384e-05, + "loss": 0.06510573387145996, + "mean_token_accuracy": 0.9790040755271912, + "num_tokens": 7023373.0, + "step": 2850 + }, + { + "entropy": 0.10860319800674915, + "epoch": 7.755020080321285, + "grad_norm": 0.35063880681991577, + "learning_rate": 3.0353383442917245e-05, + "loss": 0.06781518936157227, + "mean_token_accuracy": 0.9782285010814666, + "num_tokens": 7146448.0, + "step": 2900 + }, + { + "entropy": 0.11041728757321835, + "epoch": 7.888888888888889, + "grad_norm": 0.27241161465644836, + "learning_rate": 2.7012568514763283e-05, + "loss": 0.06919246673583984, + "mean_token_accuracy": 0.9774098896980286, + "num_tokens": 7267680.0, + "step": 2950 + }, + { + "epoch": 8.0, + "eval_entropy": 0.216568651124835, + "eval_loss": 1.0052591562271118, + "eval_mean_token_accuracy": 0.841527444422245, + "eval_num_tokens": 7363496.0, + "eval_runtime": 49.8931, + "eval_samples_per_second": 32.028, + "eval_steps_per_second": 4.009, + "step": 2992 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.0686807864376934e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a67d7b089767a1f6708064da5a07f65728eada3e --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_bias": false, + "lora_dropout": 0.0016857635936814886, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj", + "up_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fdbeb61a6495ab85e0b69bb9efab68059de8baaf --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/trainer_state.json @@ -0,0 +1,803 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.0, + "eval_steps": 500, + "global_step": 3366, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.3424121737480164, + "epoch": 0.13386880856760375, + "grad_norm": 1.1671011447906494, + "learning_rate": 2.7185774847148058e-05, + "loss": 1.2469227600097657, + "mean_token_accuracy": 0.7243366748094558, + "num_tokens": 122643.0, + "step": 50 + }, + { + "entropy": 0.6559184691309929, + "epoch": 0.2677376171352075, + "grad_norm": 0.8399614691734314, + "learning_rate": 5.4926361425870564e-05, + "loss": 0.6245294189453126, + "mean_token_accuracy": 0.8268856984376908, + "num_tokens": 245702.0, + "step": 100 + }, + { + "entropy": 0.6082554489374161, + "epoch": 0.40160642570281124, + "grad_norm": 0.5691282749176025, + "learning_rate": 8.266694800459306e-05, + "loss": 0.5804204177856446, + "mean_token_accuracy": 0.8374843555688858, + "num_tokens": 371834.0, + "step": 150 + }, + { + "entropy": 0.5805956655740738, + "epoch": 0.535475234270415, + "grad_norm": 0.45651012659072876, + "learning_rate": 0.00011040753458331558, + "loss": 0.5495451354980468, + "mean_token_accuracy": 0.8450068402290344, + "num_tokens": 500948.0, + "step": 200 + }, + { + "entropy": 0.5810796636343002, + "epoch": 0.6693440428380187, + "grad_norm": 0.493431955575943, + "learning_rate": 0.00013814812116203808, + "loss": 0.5546633911132812, + "mean_token_accuracy": 0.8431127589941024, + "num_tokens": 621874.0, + "step": 250 + }, + { + "entropy": 0.5683389616012573, + "epoch": 0.8032128514056225, + "grad_norm": 0.4698493182659149, + "learning_rate": 0.00016588870774076058, + "loss": 0.5354468536376953, + "mean_token_accuracy": 0.8477097982168198, + "num_tokens": 746138.0, + "step": 300 + }, + { + "entropy": 0.5579970148205757, + "epoch": 0.9370816599732262, + "grad_norm": 0.5637578964233398, + "learning_rate": 0.0001936292943194831, + "loss": 0.5262400054931641, + "mean_token_accuracy": 0.8487933957576752, + "num_tokens": 868331.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5953671643137932, + "eval_loss": 0.5836789608001709, + "eval_mean_token_accuracy": 0.8373425653576851, + "eval_num_tokens": 920437.0, + "eval_runtime": 49.8765, + "eval_samples_per_second": 32.039, + "eval_steps_per_second": 4.01, + "step": 374 + }, + { + "entropy": 0.5491663054986433, + "epoch": 1.069611780455154, + "grad_norm": 0.6051601767539978, + "learning_rate": 0.0002074713460228683, + "loss": 0.5132473373413086, + "mean_token_accuracy": 0.8526828770685677, + "num_tokens": 980838.0, + "step": 400 + }, + { + "entropy": 0.5334427100419998, + "epoch": 1.2034805890227578, + "grad_norm": 0.4304046630859375, + "learning_rate": 0.00020724550557791978, + "loss": 0.497388916015625, + "mean_token_accuracy": 0.8541187030076981, + "num_tokens": 1104763.0, + "step": 450 + }, + { + "entropy": 0.5297759872674942, + "epoch": 1.3373493975903614, + "grad_norm": 0.4243815541267395, + "learning_rate": 0.00020679431642677408, + "loss": 0.49724563598632815, + "mean_token_accuracy": 0.8557562667131424, + "num_tokens": 1230459.0, + "step": 500 + }, + { + "entropy": 0.5287212440371514, + "epoch": 1.4712182061579653, + "grad_norm": 0.5970085859298706, + "learning_rate": 0.0002061187609762355, + "loss": 0.4903334808349609, + "mean_token_accuracy": 0.8566980129480362, + "num_tokens": 1356368.0, + "step": 550 + }, + { + "entropy": 0.5196757692098618, + "epoch": 1.605087014725569, + "grad_norm": 0.353085458278656, + "learning_rate": 0.00020522031016209576, + "loss": 0.48056564331054685, + "mean_token_accuracy": 0.8591135066747665, + "num_tokens": 1484569.0, + "step": 600 + }, + { + "entropy": 0.5124905353784561, + "epoch": 1.7389558232931726, + "grad_norm": 0.383682519197464, + "learning_rate": 0.00020410092024635923, + "loss": 0.47599597930908205, + "mean_token_accuracy": 0.8606968414783478, + "num_tokens": 1609962.0, + "step": 650 + }, + { + "entropy": 0.5109778612852096, + "epoch": 1.8728246318607764, + "grad_norm": 0.35959598422050476, + "learning_rate": 0.00020276302855773176, + "loss": 0.47350929260253904, + "mean_token_accuracy": 0.862987876534462, + "num_tokens": 1729667.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.5806624293327332, + "eval_loss": 0.5672553181648254, + "eval_mean_token_accuracy": 0.8352122050523758, + "eval_num_tokens": 1840874.0, + "eval_runtime": 49.8669, + "eval_samples_per_second": 32.045, + "eval_steps_per_second": 4.011, + "step": 748 + }, + { + "entropy": 0.5177121743409321, + "epoch": 2.005354752342704, + "grad_norm": 0.33286598324775696, + "learning_rate": 0.00020120954818464854, + "loss": 0.4759817886352539, + "mean_token_accuracy": 0.8611076672871908, + "num_tokens": 1846410.0, + "step": 750 + }, + { + "entropy": 0.43643771946430204, + "epoch": 2.139223560910308, + "grad_norm": 0.3718855679035187, + "learning_rate": 0.00019944386163239588, + "loss": 0.3936069107055664, + "mean_token_accuracy": 0.8780064672231674, + "num_tokens": 1971547.0, + "step": 800 + }, + { + "entropy": 0.44890178814530374, + "epoch": 2.2730923694779115, + "grad_norm": 0.4402499496936798, + "learning_rate": 0.0001974698134581373, + "loss": 0.40427154541015625, + "mean_token_accuracy": 0.8763552361726761, + "num_tokens": 2088560.0, + "step": 850 + }, + { + "entropy": 0.44539201706647874, + "epoch": 2.4069611780455156, + "grad_norm": 0.437489777803421, + "learning_rate": 0.00019529170189988115, + "loss": 0.4049137878417969, + "mean_token_accuracy": 0.8759620261192321, + "num_tokens": 2210146.0, + "step": 900 + }, + { + "entropy": 0.4456777948141098, + "epoch": 2.540829986613119, + "grad_norm": 0.5221232175827026, + "learning_rate": 0.0001929142695176156, + "loss": 0.4049659729003906, + "mean_token_accuracy": 0.8767672145366668, + "num_tokens": 2331354.0, + "step": 950 + }, + { + "entropy": 0.44445386946201326, + "epoch": 2.674698795180723, + "grad_norm": 0.48457658290863037, + "learning_rate": 0.00019034269286698953, + "loss": 0.4065634536743164, + "mean_token_accuracy": 0.8764267575740814, + "num_tokens": 2452558.0, + "step": 1000 + }, + { + "entropy": 0.44160154819488523, + "epoch": 2.8085676037483265, + "grad_norm": 0.36902090907096863, + "learning_rate": 0.00018758257122802307, + "loss": 0.4023736953735352, + "mean_token_accuracy": 0.8786762475967407, + "num_tokens": 2582428.0, + "step": 1050 + }, + { + "entropy": 0.4473246121406555, + "epoch": 2.9424364123159306, + "grad_norm": 0.3760707676410675, + "learning_rate": 0.00018463991441338993, + "loss": 0.40938362121582034, + "mean_token_accuracy": 0.8754651814699173, + "num_tokens": 2707532.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.4759794683754444, + "eval_loss": 0.5672900676727295, + "eval_mean_token_accuracy": 0.843816783130169, + "eval_num_tokens": 2761311.0, + "eval_runtime": 49.9422, + "eval_samples_per_second": 31.997, + "eval_steps_per_second": 4.005, + "step": 1122 + }, + { + "entropy": 0.3893387094892637, + "epoch": 3.074966532797858, + "grad_norm": 0.4708668291568756, + "learning_rate": 0.00018152112968281706, + "loss": 0.3442184829711914, + "mean_token_accuracy": 0.8925870771359916, + "num_tokens": 2834344.0, + "step": 1150 + }, + { + "entropy": 0.36019587606191633, + "epoch": 3.208835341365462, + "grad_norm": 0.49764320254325867, + "learning_rate": 0.00017823300779209423, + "loss": 0.3140977668762207, + "mean_token_accuracy": 0.8975517880916596, + "num_tokens": 2952748.0, + "step": 1200 + }, + { + "entropy": 0.3615420612692833, + "epoch": 3.3427041499330654, + "grad_norm": 0.49308347702026367, + "learning_rate": 0.0001747827082070698, + "loss": 0.31728214263916016, + "mean_token_accuracy": 0.8988397961854935, + "num_tokens": 3080333.0, + "step": 1250 + }, + { + "entropy": 0.36845425054430964, + "epoch": 3.4765729585006695, + "grad_norm": 0.47647032141685486, + "learning_rate": 0.00017117774351482735, + "loss": 0.3203315734863281, + "mean_token_accuracy": 0.8968151319026947, + "num_tokens": 3206920.0, + "step": 1300 + }, + { + "entropy": 0.37361170917749403, + "epoch": 3.610441767068273, + "grad_norm": 0.5282555818557739, + "learning_rate": 0.000167425963065986, + "loss": 0.32945499420166013, + "mean_token_accuracy": 0.8949814730882645, + "num_tokens": 3327931.0, + "step": 1350 + }, + { + "entropy": 0.3739692223072052, + "epoch": 3.7443105756358768, + "grad_norm": 0.41107845306396484, + "learning_rate": 0.00016353553588374095, + "loss": 0.32604251861572264, + "mean_token_accuracy": 0.896143769621849, + "num_tokens": 3443874.0, + "step": 1400 + }, + { + "entropy": 0.37787127375602725, + "epoch": 3.878179384203481, + "grad_norm": 0.39750751852989197, + "learning_rate": 0.00015951493287685788, + "loss": 0.3352021026611328, + "mean_token_accuracy": 0.8935402005910873, + "num_tokens": 3573158.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.4485042405128479, + "eval_loss": 0.5753094553947449, + "eval_mean_token_accuracy": 0.8450798985362052, + "eval_num_tokens": 3681748.0, + "eval_runtime": 49.9605, + "eval_samples_per_second": 31.985, + "eval_steps_per_second": 4.003, + "step": 1496 + }, + { + "entropy": 0.3735277107869736, + "epoch": 4.010709504685408, + "grad_norm": 0.41153082251548767, + "learning_rate": 0.00015537290839535005, + "loss": 0.327095947265625, + "mean_token_accuracy": 0.8959399853089843, + "num_tokens": 3691654.0, + "step": 1500 + }, + { + "entropy": 0.2689096394181252, + "epoch": 4.144578313253012, + "grad_norm": 0.5363789200782776, + "learning_rate": 0.00015111848116899814, + "loss": 0.2247480583190918, + "mean_token_accuracy": 0.9249986118078232, + "num_tokens": 3813863.0, + "step": 1550 + }, + { + "entropy": 0.27684757232666013, + "epoch": 4.278447121820616, + "grad_norm": 0.5589100122451782, + "learning_rate": 0.00014676091467021694, + "loss": 0.23430667877197264, + "mean_token_accuracy": 0.9212016260623932, + "num_tokens": 3942009.0, + "step": 1600 + }, + { + "entropy": 0.27285940989851953, + "epoch": 4.412315930388219, + "grad_norm": 0.4415719211101532, + "learning_rate": 0.00014230969694402636, + "loss": 0.23151195526123047, + "mean_token_accuracy": 0.922565575838089, + "num_tokens": 4067146.0, + "step": 1650 + }, + { + "entropy": 0.28027778953313826, + "epoch": 4.546184738955823, + "grad_norm": 0.544822096824646, + "learning_rate": 0.0001377745199490439, + "loss": 0.23426279067993164, + "mean_token_accuracy": 0.9214058065414429, + "num_tokens": 4186586.0, + "step": 1700 + }, + { + "entropy": 0.2855076715350151, + "epoch": 4.680053547523427, + "grad_norm": 0.47745293378829956, + "learning_rate": 0.00013316525845448153, + "loss": 0.2384078598022461, + "mean_token_accuracy": 0.9208149307966232, + "num_tokens": 4307001.0, + "step": 1750 + }, + { + "entropy": 0.28488670364022256, + "epoch": 4.813922356091031, + "grad_norm": 0.6087909936904907, + "learning_rate": 0.00012849194853909585, + "loss": 0.24047565460205078, + "mean_token_accuracy": 0.9198217475414276, + "num_tokens": 4429513.0, + "step": 1800 + }, + { + "entropy": 0.2799084801971912, + "epoch": 4.947791164658635, + "grad_norm": 0.4444660544395447, + "learning_rate": 0.00012376476573890707, + "loss": 0.23463037490844726, + "mean_token_accuracy": 0.9206935846805573, + "num_tokens": 4557562.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.3690616069734097, + "eval_loss": 0.6533966064453125, + "eval_mean_token_accuracy": 0.8432427588105201, + "eval_num_tokens": 4602185.0, + "eval_runtime": 49.893, + "eval_samples_per_second": 32.029, + "eval_steps_per_second": 4.009, + "step": 1870 + }, + { + "entropy": 0.2288022293436407, + "epoch": 5.080321285140562, + "grad_norm": 0.4002731442451477, + "learning_rate": 0.0001189940028912678, + "loss": 0.17887537002563478, + "mean_token_accuracy": 0.9409801201386885, + "num_tokens": 4679483.0, + "step": 1900 + }, + { + "entropy": 0.192287794649601, + "epoch": 5.214190093708166, + "grad_norm": 0.4603167474269867, + "learning_rate": 0.00011419004772352316, + "loss": 0.14474411010742189, + "mean_token_accuracy": 0.9518448287248611, + "num_tokens": 4800131.0, + "step": 1950 + }, + { + "entropy": 0.19072901770472528, + "epoch": 5.34805890227577, + "grad_norm": 0.6232134103775024, + "learning_rate": 0.00010936336023505987, + "loss": 0.14428988456726075, + "mean_token_accuracy": 0.9511868554353714, + "num_tokens": 4923382.0, + "step": 2000 + }, + { + "entropy": 0.19722454741597176, + "epoch": 5.481927710843373, + "grad_norm": 0.49368390440940857, + "learning_rate": 0.00010452444992199237, + "loss": 0.15026931762695311, + "mean_token_accuracy": 0.9493078935146332, + "num_tokens": 5042200.0, + "step": 2050 + }, + { + "entropy": 0.1921817621588707, + "epoch": 5.615796519410977, + "grad_norm": 0.6033351421356201, + "learning_rate": 9.9683852894076e-05, + "loss": 0.15000157356262206, + "mean_token_accuracy": 0.9497224026918412, + "num_tokens": 5168285.0, + "step": 2100 + }, + { + "entropy": 0.19455582827329634, + "epoch": 5.749665327978581, + "grad_norm": 0.4534570276737213, + "learning_rate": 9.485210893367247e-05, + "loss": 0.14963313102722167, + "mean_token_accuracy": 0.94916872382164, + "num_tokens": 5289880.0, + "step": 2150 + }, + { + "entropy": 0.18738240271806716, + "epoch": 5.883534136546185, + "grad_norm": 0.5815815329551697, + "learning_rate": 9.003973854671866e-05, + "loss": 0.14579124450683595, + "mean_token_accuracy": 0.9498835545778275, + "num_tokens": 5413325.0, + "step": 2200 + }, + { + "epoch": 6.0, + "eval_entropy": 0.29356860227882864, + "eval_loss": 0.7589722275733948, + "eval_mean_token_accuracy": 0.8418136316537858, + "eval_num_tokens": 5522622.0, + "eval_runtime": 49.8769, + "eval_samples_per_second": 32.039, + "eval_steps_per_second": 4.01, + "step": 2244 + }, + { + "entropy": 0.19056345296628546, + "epoch": 6.016064257028113, + "grad_norm": 0.29291921854019165, + "learning_rate": 8.525722005566732e-05, + "loss": 0.14140020370483397, + "mean_token_accuracy": 0.9524310213146787, + "num_tokens": 5536511.0, + "step": 2250 + }, + { + "entropy": 0.13221844218671322, + "epoch": 6.149933065595716, + "grad_norm": 0.34443414211273193, + "learning_rate": 8.051496678427703e-05, + "loss": 0.0891877555847168, + "mean_token_accuracy": 0.97141546189785, + "num_tokens": 5663054.0, + "step": 2300 + }, + { + "entropy": 0.13209220491349696, + "epoch": 6.28380187416332, + "grad_norm": 0.4142569899559021, + "learning_rate": 7.58233043839285e-05, + "loss": 0.08825708389282226, + "mean_token_accuracy": 0.9711007869243622, + "num_tokens": 5790379.0, + "step": 2350 + }, + { + "entropy": 0.14063000075519086, + "epoch": 6.417670682730924, + "grad_norm": 0.47584882378578186, + "learning_rate": 7.119244835083612e-05, + "loss": 0.09473857879638672, + "mean_token_accuracy": 0.9696242707967758, + "num_tokens": 5908852.0, + "step": 2400 + }, + { + "entropy": 0.13315705463290214, + "epoch": 6.551539491298527, + "grad_norm": 0.3179854452610016, + "learning_rate": 6.66324817831086e-05, + "loss": 0.0911135196685791, + "mean_token_accuracy": 0.9703826290369034, + "num_tokens": 6033966.0, + "step": 2450 + }, + { + "entropy": 0.13554719373583793, + "epoch": 6.685408299866131, + "grad_norm": 0.49769842624664307, + "learning_rate": 6.215333342608944e-05, + "loss": 0.09153086662292481, + "mean_token_accuracy": 0.9705063331127167, + "num_tokens": 6156278.0, + "step": 2500 + }, + { + "entropy": 0.13915603026747703, + "epoch": 6.8192771084337345, + "grad_norm": 0.467375785112381, + "learning_rate": 5.7764756053780784e-05, + "loss": 0.09427680969238281, + "mean_token_accuracy": 0.9695158433914185, + "num_tokens": 6276774.0, + "step": 2550 + }, + { + "entropy": 0.13692217327654363, + "epoch": 6.953145917001339, + "grad_norm": 0.36858609318733215, + "learning_rate": 5.3476305233422516e-05, + "loss": 0.09176054954528809, + "mean_token_accuracy": 0.969379341006279, + "num_tokens": 6401444.0, + "step": 2600 + }, + { + "epoch": 7.0, + "eval_entropy": 0.24440797246992588, + "eval_loss": 0.891926646232605, + "eval_mean_token_accuracy": 0.8400790172815323, + "eval_num_tokens": 6443059.0, + "eval_runtime": 49.9579, + "eval_samples_per_second": 31.987, + "eval_steps_per_second": 4.003, + "step": 2618 + }, + { + "entropy": 0.1212329932234504, + "epoch": 7.085676037483267, + "grad_norm": 0.2522813677787781, + "learning_rate": 4.929731851946405e-05, + "loss": 0.07568974018096924, + "mean_token_accuracy": 0.975432159924748, + "num_tokens": 6525142.0, + "step": 2650 + }, + { + "entropy": 0.10857273273169994, + "epoch": 7.21954484605087, + "grad_norm": 0.2335294634103775, + "learning_rate": 4.5236895122230764e-05, + "loss": 0.06618132591247558, + "mean_token_accuracy": 0.9785620093345642, + "num_tokens": 6651930.0, + "step": 2700 + }, + { + "entropy": 0.1136517857015133, + "epoch": 7.353413654618474, + "grad_norm": 0.3027023375034332, + "learning_rate": 4.130387609555471e-05, + "loss": 0.06987609386444092, + "mean_token_accuracy": 0.9772803634405136, + "num_tokens": 6768469.0, + "step": 2750 + }, + { + "entropy": 0.10837352603673935, + "epoch": 7.4872824631860775, + "grad_norm": 0.3191539943218231, + "learning_rate": 3.750682508650807e-05, + "loss": 0.06725080013275146, + "mean_token_accuracy": 0.9786273115873336, + "num_tokens": 6892532.0, + "step": 2800 + }, + { + "entropy": 0.10379995822906495, + "epoch": 7.621151271753681, + "grad_norm": 0.21721133589744568, + "learning_rate": 3.3854009689154384e-05, + "loss": 0.06510573387145996, + "mean_token_accuracy": 0.9790040755271912, + "num_tokens": 7023373.0, + "step": 2850 + }, + { + "entropy": 0.10860319800674915, + "epoch": 7.755020080321285, + "grad_norm": 0.35063880681991577, + "learning_rate": 3.0353383442917245e-05, + "loss": 0.06781518936157227, + "mean_token_accuracy": 0.9782285010814666, + "num_tokens": 7146448.0, + "step": 2900 + }, + { + "entropy": 0.11041728757321835, + "epoch": 7.888888888888889, + "grad_norm": 0.27241161465644836, + "learning_rate": 2.7012568514763283e-05, + "loss": 0.06919246673583984, + "mean_token_accuracy": 0.9774098896980286, + "num_tokens": 7267680.0, + "step": 2950 + }, + { + "epoch": 8.0, + "eval_entropy": 0.216568651124835, + "eval_loss": 1.0052591562271118, + "eval_mean_token_accuracy": 0.841527444422245, + "eval_num_tokens": 7363496.0, + "eval_runtime": 49.8931, + "eval_samples_per_second": 32.028, + "eval_steps_per_second": 4.009, + "step": 2992 + }, + { + "entropy": 0.11074423059971646, + "epoch": 8.021419009370817, + "grad_norm": 0.12072166800498962, + "learning_rate": 2.3838839102906225e-05, + "loss": 0.07017123222351074, + "mean_token_accuracy": 0.9776547671568514, + "num_tokens": 7384150.0, + "step": 3000 + }, + { + "entropy": 0.10425103880465031, + "epoch": 8.15528781793842, + "grad_norm": 0.20287242531776428, + "learning_rate": 2.0839105598168276e-05, + "loss": 0.06177260398864746, + "mean_token_accuracy": 0.9801955896615983, + "num_tokens": 7502454.0, + "step": 3050 + }, + { + "entropy": 0.10014630381017924, + "epoch": 8.289156626506024, + "grad_norm": 0.1157577857375145, + "learning_rate": 1.8019899537486024e-05, + "loss": 0.05763424873352051, + "mean_token_accuracy": 0.9802741694450379, + "num_tokens": 7628419.0, + "step": 3100 + }, + { + "entropy": 0.09626397963613272, + "epoch": 8.423025435073628, + "grad_norm": 0.12889772653579712, + "learning_rate": 1.5387359382322228e-05, + "loss": 0.05830557346343994, + "mean_token_accuracy": 0.9807974797487259, + "num_tokens": 7751912.0, + "step": 3150 + }, + { + "entropy": 0.09667510379105806, + "epoch": 8.556894243641231, + "grad_norm": 0.18801453709602356, + "learning_rate": 1.2947217152949136e-05, + "loss": 0.058124661445617676, + "mean_token_accuracy": 0.98047631919384, + "num_tokens": 7877804.0, + "step": 3200 + }, + { + "entropy": 0.09806526392698288, + "epoch": 8.690763052208835, + "grad_norm": 0.11081992089748383, + "learning_rate": 1.0704785947705815e-05, + "loss": 0.05876843929290772, + "mean_token_accuracy": 0.9807141083478927, + "num_tokens": 8003296.0, + "step": 3250 + }, + { + "entropy": 0.1030188063904643, + "epoch": 8.824631860776439, + "grad_norm": 0.11520951986312866, + "learning_rate": 8.664948374404545e-06, + "loss": 0.06109299659729004, + "mean_token_accuracy": 0.9795061159133911, + "num_tokens": 8123859.0, + "step": 3300 + }, + { + "entropy": 0.10020156983286142, + "epoch": 8.958500669344042, + "grad_norm": 0.12751302123069763, + "learning_rate": 6.832145919075181e-06, + "loss": 0.05992648124694824, + "mean_token_accuracy": 0.9798818999528884, + "num_tokens": 8246971.0, + "step": 3350 + }, + { + "epoch": 9.0, + "eval_entropy": 0.20119483806192875, + "eval_loss": 1.0950454473495483, + "eval_mean_token_accuracy": 0.8413037645816803, + "eval_num_tokens": 8283933.0, + "eval_runtime": 49.8882, + "eval_samples_per_second": 32.032, + "eval_steps_per_second": 4.009, + "step": 3366 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.5767681026955366e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a67d7b089767a1f6708064da5a07f65728eada3e --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_bias": false, + "lora_dropout": 0.0016857635936814886, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj", + "up_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..769cd507cf763a85050736aef0cfe1e7f2e6e158 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/trainer_state.json @@ -0,0 +1,115 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 374, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.3424121737480164, + "epoch": 0.13386880856760375, + "grad_norm": 1.1671011447906494, + "learning_rate": 2.7185774847148058e-05, + "loss": 1.2469227600097657, + "mean_token_accuracy": 0.7243366748094558, + "num_tokens": 122643.0, + "step": 50 + }, + { + "entropy": 0.6559184691309929, + "epoch": 0.2677376171352075, + "grad_norm": 0.8399614691734314, + "learning_rate": 5.4926361425870564e-05, + "loss": 0.6245294189453126, + "mean_token_accuracy": 0.8268856984376908, + "num_tokens": 245702.0, + "step": 100 + }, + { + "entropy": 0.6082554489374161, + "epoch": 0.40160642570281124, + "grad_norm": 0.5691282749176025, + "learning_rate": 8.266694800459306e-05, + "loss": 0.5804204177856446, + "mean_token_accuracy": 0.8374843555688858, + "num_tokens": 371834.0, + "step": 150 + }, + { + "entropy": 0.5805956655740738, + "epoch": 0.535475234270415, + "grad_norm": 0.45651012659072876, + "learning_rate": 0.00011040753458331558, + "loss": 0.5495451354980468, + "mean_token_accuracy": 0.8450068402290344, + "num_tokens": 500948.0, + "step": 200 + }, + { + "entropy": 0.5810796636343002, + "epoch": 0.6693440428380187, + "grad_norm": 0.493431955575943, + "learning_rate": 0.00013814812116203808, + "loss": 0.5546633911132812, + "mean_token_accuracy": 0.8431127589941024, + "num_tokens": 621874.0, + "step": 250 + }, + { + "entropy": 0.5683389616012573, + "epoch": 0.8032128514056225, + "grad_norm": 0.4698493182659149, + "learning_rate": 0.00016588870774076058, + "loss": 0.5354468536376953, + "mean_token_accuracy": 0.8477097982168198, + "num_tokens": 746138.0, + "step": 300 + }, + { + "entropy": 0.5579970148205757, + "epoch": 0.9370816599732262, + "grad_norm": 0.5637578964233398, + "learning_rate": 0.0001936292943194831, + "loss": 0.5262400054931641, + "mean_token_accuracy": 0.8487933957576752, + "num_tokens": 868331.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5953671643137932, + "eval_loss": 0.5836789608001709, + "eval_mean_token_accuracy": 0.8373425653576851, + "eval_num_tokens": 920437.0, + "eval_runtime": 49.8765, + "eval_samples_per_second": 32.039, + "eval_steps_per_second": 4.01, + "step": 374 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.057598899339366e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a67d7b089767a1f6708064da5a07f65728eada3e --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_bias": false, + "lora_dropout": 0.0016857635936814886, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj", + "up_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..aa79e9718e46078d448f64116375ce19a3293a3b --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/trainer_state.json @@ -0,0 +1,884 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "eval_steps": 500, + "global_step": 3740, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.3424121737480164, + "epoch": 0.13386880856760375, + "grad_norm": 1.1671011447906494, + "learning_rate": 2.7185774847148058e-05, + "loss": 1.2469227600097657, + "mean_token_accuracy": 0.7243366748094558, + "num_tokens": 122643.0, + "step": 50 + }, + { + "entropy": 0.6559184691309929, + "epoch": 0.2677376171352075, + "grad_norm": 0.8399614691734314, + "learning_rate": 5.4926361425870564e-05, + "loss": 0.6245294189453126, + "mean_token_accuracy": 0.8268856984376908, + "num_tokens": 245702.0, + "step": 100 + }, + { + "entropy": 0.6082554489374161, + "epoch": 0.40160642570281124, + "grad_norm": 0.5691282749176025, + "learning_rate": 8.266694800459306e-05, + "loss": 0.5804204177856446, + "mean_token_accuracy": 0.8374843555688858, + "num_tokens": 371834.0, + "step": 150 + }, + { + "entropy": 0.5805956655740738, + "epoch": 0.535475234270415, + "grad_norm": 0.45651012659072876, + "learning_rate": 0.00011040753458331558, + "loss": 0.5495451354980468, + "mean_token_accuracy": 0.8450068402290344, + "num_tokens": 500948.0, + "step": 200 + }, + { + "entropy": 0.5810796636343002, + "epoch": 0.6693440428380187, + "grad_norm": 0.493431955575943, + "learning_rate": 0.00013814812116203808, + "loss": 0.5546633911132812, + "mean_token_accuracy": 0.8431127589941024, + "num_tokens": 621874.0, + "step": 250 + }, + { + "entropy": 0.5683389616012573, + "epoch": 0.8032128514056225, + "grad_norm": 0.4698493182659149, + "learning_rate": 0.00016588870774076058, + "loss": 0.5354468536376953, + "mean_token_accuracy": 0.8477097982168198, + "num_tokens": 746138.0, + "step": 300 + }, + { + "entropy": 0.5579970148205757, + "epoch": 0.9370816599732262, + "grad_norm": 0.5637578964233398, + "learning_rate": 0.0001936292943194831, + "loss": 0.5262400054931641, + "mean_token_accuracy": 0.8487933957576752, + "num_tokens": 868331.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5953671643137932, + "eval_loss": 0.5836789608001709, + "eval_mean_token_accuracy": 0.8373425653576851, + "eval_num_tokens": 920437.0, + "eval_runtime": 49.8765, + "eval_samples_per_second": 32.039, + "eval_steps_per_second": 4.01, + "step": 374 + }, + { + "entropy": 0.5491663054986433, + "epoch": 1.069611780455154, + "grad_norm": 0.6051601767539978, + "learning_rate": 0.0002074713460228683, + "loss": 0.5132473373413086, + "mean_token_accuracy": 0.8526828770685677, + "num_tokens": 980838.0, + "step": 400 + }, + { + "entropy": 0.5334427100419998, + "epoch": 1.2034805890227578, + "grad_norm": 0.4304046630859375, + "learning_rate": 0.00020724550557791978, + "loss": 0.497388916015625, + "mean_token_accuracy": 0.8541187030076981, + "num_tokens": 1104763.0, + "step": 450 + }, + { + "entropy": 0.5297759872674942, + "epoch": 1.3373493975903614, + "grad_norm": 0.4243815541267395, + "learning_rate": 0.00020679431642677408, + "loss": 0.49724563598632815, + "mean_token_accuracy": 0.8557562667131424, + "num_tokens": 1230459.0, + "step": 500 + }, + { + "entropy": 0.5287212440371514, + "epoch": 1.4712182061579653, + "grad_norm": 0.5970085859298706, + "learning_rate": 0.0002061187609762355, + "loss": 0.4903334808349609, + "mean_token_accuracy": 0.8566980129480362, + "num_tokens": 1356368.0, + "step": 550 + }, + { + "entropy": 0.5196757692098618, + "epoch": 1.605087014725569, + "grad_norm": 0.353085458278656, + "learning_rate": 0.00020522031016209576, + "loss": 0.48056564331054685, + "mean_token_accuracy": 0.8591135066747665, + "num_tokens": 1484569.0, + "step": 600 + }, + { + "entropy": 0.5124905353784561, + "epoch": 1.7389558232931726, + "grad_norm": 0.383682519197464, + "learning_rate": 0.00020410092024635923, + "loss": 0.47599597930908205, + "mean_token_accuracy": 0.8606968414783478, + "num_tokens": 1609962.0, + "step": 650 + }, + { + "entropy": 0.5109778612852096, + "epoch": 1.8728246318607764, + "grad_norm": 0.35959598422050476, + "learning_rate": 0.00020276302855773176, + "loss": 0.47350929260253904, + "mean_token_accuracy": 0.862987876534462, + "num_tokens": 1729667.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.5806624293327332, + "eval_loss": 0.5672553181648254, + "eval_mean_token_accuracy": 0.8352122050523758, + "eval_num_tokens": 1840874.0, + "eval_runtime": 49.8669, + "eval_samples_per_second": 32.045, + "eval_steps_per_second": 4.011, + "step": 748 + }, + { + "entropy": 0.5177121743409321, + "epoch": 2.005354752342704, + "grad_norm": 0.33286598324775696, + "learning_rate": 0.00020120954818464854, + "loss": 0.4759817886352539, + "mean_token_accuracy": 0.8611076672871908, + "num_tokens": 1846410.0, + "step": 750 + }, + { + "entropy": 0.43643771946430204, + "epoch": 2.139223560910308, + "grad_norm": 0.3718855679035187, + "learning_rate": 0.00019944386163239588, + "loss": 0.3936069107055664, + "mean_token_accuracy": 0.8780064672231674, + "num_tokens": 1971547.0, + "step": 800 + }, + { + "entropy": 0.44890178814530374, + "epoch": 2.2730923694779115, + "grad_norm": 0.4402499496936798, + "learning_rate": 0.0001974698134581373, + "loss": 0.40427154541015625, + "mean_token_accuracy": 0.8763552361726761, + "num_tokens": 2088560.0, + "step": 850 + }, + { + "entropy": 0.44539201706647874, + "epoch": 2.4069611780455156, + "grad_norm": 0.437489777803421, + "learning_rate": 0.00019529170189988115, + "loss": 0.4049137878417969, + "mean_token_accuracy": 0.8759620261192321, + "num_tokens": 2210146.0, + "step": 900 + }, + { + "entropy": 0.4456777948141098, + "epoch": 2.540829986613119, + "grad_norm": 0.5221232175827026, + "learning_rate": 0.0001929142695176156, + "loss": 0.4049659729003906, + "mean_token_accuracy": 0.8767672145366668, + "num_tokens": 2331354.0, + "step": 950 + }, + { + "entropy": 0.44445386946201326, + "epoch": 2.674698795180723, + "grad_norm": 0.48457658290863037, + "learning_rate": 0.00019034269286698953, + "loss": 0.4065634536743164, + "mean_token_accuracy": 0.8764267575740814, + "num_tokens": 2452558.0, + "step": 1000 + }, + { + "entropy": 0.44160154819488523, + "epoch": 2.8085676037483265, + "grad_norm": 0.36902090907096863, + "learning_rate": 0.00018758257122802307, + "loss": 0.4023736953735352, + "mean_token_accuracy": 0.8786762475967407, + "num_tokens": 2582428.0, + "step": 1050 + }, + { + "entropy": 0.4473246121406555, + "epoch": 2.9424364123159306, + "grad_norm": 0.3760707676410675, + "learning_rate": 0.00018463991441338993, + "loss": 0.40938362121582034, + "mean_token_accuracy": 0.8754651814699173, + "num_tokens": 2707532.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.4759794683754444, + "eval_loss": 0.5672900676727295, + "eval_mean_token_accuracy": 0.843816783130169, + "eval_num_tokens": 2761311.0, + "eval_runtime": 49.9422, + "eval_samples_per_second": 31.997, + "eval_steps_per_second": 4.005, + "step": 1122 + }, + { + "entropy": 0.3893387094892637, + "epoch": 3.074966532797858, + "grad_norm": 0.4708668291568756, + "learning_rate": 0.00018152112968281706, + "loss": 0.3442184829711914, + "mean_token_accuracy": 0.8925870771359916, + "num_tokens": 2834344.0, + "step": 1150 + }, + { + "entropy": 0.36019587606191633, + "epoch": 3.208835341365462, + "grad_norm": 0.49764320254325867, + "learning_rate": 0.00017823300779209423, + "loss": 0.3140977668762207, + "mean_token_accuracy": 0.8975517880916596, + "num_tokens": 2952748.0, + "step": 1200 + }, + { + "entropy": 0.3615420612692833, + "epoch": 3.3427041499330654, + "grad_norm": 0.49308347702026367, + "learning_rate": 0.0001747827082070698, + "loss": 0.31728214263916016, + "mean_token_accuracy": 0.8988397961854935, + "num_tokens": 3080333.0, + "step": 1250 + }, + { + "entropy": 0.36845425054430964, + "epoch": 3.4765729585006695, + "grad_norm": 0.47647032141685486, + "learning_rate": 0.00017117774351482735, + "loss": 0.3203315734863281, + "mean_token_accuracy": 0.8968151319026947, + "num_tokens": 3206920.0, + "step": 1300 + }, + { + "entropy": 0.37361170917749403, + "epoch": 3.610441767068273, + "grad_norm": 0.5282555818557739, + "learning_rate": 0.000167425963065986, + "loss": 0.32945499420166013, + "mean_token_accuracy": 0.8949814730882645, + "num_tokens": 3327931.0, + "step": 1350 + }, + { + "entropy": 0.3739692223072052, + "epoch": 3.7443105756358768, + "grad_norm": 0.41107845306396484, + "learning_rate": 0.00016353553588374095, + "loss": 0.32604251861572264, + "mean_token_accuracy": 0.896143769621849, + "num_tokens": 3443874.0, + "step": 1400 + }, + { + "entropy": 0.37787127375602725, + "epoch": 3.878179384203481, + "grad_norm": 0.39750751852989197, + "learning_rate": 0.00015951493287685788, + "loss": 0.3352021026611328, + "mean_token_accuracy": 0.8935402005910873, + "num_tokens": 3573158.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.4485042405128479, + "eval_loss": 0.5753094553947449, + "eval_mean_token_accuracy": 0.8450798985362052, + "eval_num_tokens": 3681748.0, + "eval_runtime": 49.9605, + "eval_samples_per_second": 31.985, + "eval_steps_per_second": 4.003, + "step": 1496 + }, + { + "entropy": 0.3735277107869736, + "epoch": 4.010709504685408, + "grad_norm": 0.41153082251548767, + "learning_rate": 0.00015537290839535005, + "loss": 0.327095947265625, + "mean_token_accuracy": 0.8959399853089843, + "num_tokens": 3691654.0, + "step": 1500 + }, + { + "entropy": 0.2689096394181252, + "epoch": 4.144578313253012, + "grad_norm": 0.5363789200782776, + "learning_rate": 0.00015111848116899814, + "loss": 0.2247480583190918, + "mean_token_accuracy": 0.9249986118078232, + "num_tokens": 3813863.0, + "step": 1550 + }, + { + "entropy": 0.27684757232666013, + "epoch": 4.278447121820616, + "grad_norm": 0.5589100122451782, + "learning_rate": 0.00014676091467021694, + "loss": 0.23430667877197264, + "mean_token_accuracy": 0.9212016260623932, + "num_tokens": 3942009.0, + "step": 1600 + }, + { + "entropy": 0.27285940989851953, + "epoch": 4.412315930388219, + "grad_norm": 0.4415719211101532, + "learning_rate": 0.00014230969694402636, + "loss": 0.23151195526123047, + "mean_token_accuracy": 0.922565575838089, + "num_tokens": 4067146.0, + "step": 1650 + }, + { + "entropy": 0.28027778953313826, + "epoch": 4.546184738955823, + "grad_norm": 0.544822096824646, + "learning_rate": 0.0001377745199490439, + "loss": 0.23426279067993164, + "mean_token_accuracy": 0.9214058065414429, + "num_tokens": 4186586.0, + "step": 1700 + }, + { + "entropy": 0.2855076715350151, + "epoch": 4.680053547523427, + "grad_norm": 0.47745293378829956, + "learning_rate": 0.00013316525845448153, + "loss": 0.2384078598022461, + "mean_token_accuracy": 0.9208149307966232, + "num_tokens": 4307001.0, + "step": 1750 + }, + { + "entropy": 0.28488670364022256, + "epoch": 4.813922356091031, + "grad_norm": 0.6087909936904907, + "learning_rate": 0.00012849194853909585, + "loss": 0.24047565460205078, + "mean_token_accuracy": 0.9198217475414276, + "num_tokens": 4429513.0, + "step": 1800 + }, + { + "entropy": 0.2799084801971912, + "epoch": 4.947791164658635, + "grad_norm": 0.4444660544395447, + "learning_rate": 0.00012376476573890707, + "loss": 0.23463037490844726, + "mean_token_accuracy": 0.9206935846805573, + "num_tokens": 4557562.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.3690616069734097, + "eval_loss": 0.6533966064453125, + "eval_mean_token_accuracy": 0.8432427588105201, + "eval_num_tokens": 4602185.0, + "eval_runtime": 49.893, + "eval_samples_per_second": 32.029, + "eval_steps_per_second": 4.009, + "step": 1870 + }, + { + "entropy": 0.2288022293436407, + "epoch": 5.080321285140562, + "grad_norm": 0.4002731442451477, + "learning_rate": 0.0001189940028912678, + "loss": 0.17887537002563478, + "mean_token_accuracy": 0.9409801201386885, + "num_tokens": 4679483.0, + "step": 1900 + }, + { + "entropy": 0.192287794649601, + "epoch": 5.214190093708166, + "grad_norm": 0.4603167474269867, + "learning_rate": 0.00011419004772352316, + "loss": 0.14474411010742189, + "mean_token_accuracy": 0.9518448287248611, + "num_tokens": 4800131.0, + "step": 1950 + }, + { + "entropy": 0.19072901770472528, + "epoch": 5.34805890227577, + "grad_norm": 0.6232134103775024, + "learning_rate": 0.00010936336023505987, + "loss": 0.14428988456726075, + "mean_token_accuracy": 0.9511868554353714, + "num_tokens": 4923382.0, + "step": 2000 + }, + { + "entropy": 0.19722454741597176, + "epoch": 5.481927710843373, + "grad_norm": 0.49368390440940857, + "learning_rate": 0.00010452444992199237, + "loss": 0.15026931762695311, + "mean_token_accuracy": 0.9493078935146332, + "num_tokens": 5042200.0, + "step": 2050 + }, + { + "entropy": 0.1921817621588707, + "epoch": 5.615796519410977, + "grad_norm": 0.6033351421356201, + "learning_rate": 9.9683852894076e-05, + "loss": 0.15000157356262206, + "mean_token_accuracy": 0.9497224026918412, + "num_tokens": 5168285.0, + "step": 2100 + }, + { + "entropy": 0.19455582827329634, + "epoch": 5.749665327978581, + "grad_norm": 0.4534570276737213, + "learning_rate": 9.485210893367247e-05, + "loss": 0.14963313102722167, + "mean_token_accuracy": 0.94916872382164, + "num_tokens": 5289880.0, + "step": 2150 + }, + { + "entropy": 0.18738240271806716, + "epoch": 5.883534136546185, + "grad_norm": 0.5815815329551697, + "learning_rate": 9.003973854671866e-05, + "loss": 0.14579124450683595, + "mean_token_accuracy": 0.9498835545778275, + "num_tokens": 5413325.0, + "step": 2200 + }, + { + "epoch": 6.0, + "eval_entropy": 0.29356860227882864, + "eval_loss": 0.7589722275733948, + "eval_mean_token_accuracy": 0.8418136316537858, + "eval_num_tokens": 5522622.0, + "eval_runtime": 49.8769, + "eval_samples_per_second": 32.039, + "eval_steps_per_second": 4.01, + "step": 2244 + }, + { + "entropy": 0.19056345296628546, + "epoch": 6.016064257028113, + "grad_norm": 0.29291921854019165, + "learning_rate": 8.525722005566732e-05, + "loss": 0.14140020370483397, + "mean_token_accuracy": 0.9524310213146787, + "num_tokens": 5536511.0, + "step": 2250 + }, + { + "entropy": 0.13221844218671322, + "epoch": 6.149933065595716, + "grad_norm": 0.34443414211273193, + "learning_rate": 8.051496678427703e-05, + "loss": 0.0891877555847168, + "mean_token_accuracy": 0.97141546189785, + "num_tokens": 5663054.0, + "step": 2300 + }, + { + "entropy": 0.13209220491349696, + "epoch": 6.28380187416332, + "grad_norm": 0.4142569899559021, + "learning_rate": 7.58233043839285e-05, + "loss": 0.08825708389282226, + "mean_token_accuracy": 0.9711007869243622, + "num_tokens": 5790379.0, + "step": 2350 + }, + { + "entropy": 0.14063000075519086, + "epoch": 6.417670682730924, + "grad_norm": 0.47584882378578186, + "learning_rate": 7.119244835083612e-05, + "loss": 0.09473857879638672, + "mean_token_accuracy": 0.9696242707967758, + "num_tokens": 5908852.0, + "step": 2400 + }, + { + "entropy": 0.13315705463290214, + "epoch": 6.551539491298527, + "grad_norm": 0.3179854452610016, + "learning_rate": 6.66324817831086e-05, + "loss": 0.0911135196685791, + "mean_token_accuracy": 0.9703826290369034, + "num_tokens": 6033966.0, + "step": 2450 + }, + { + "entropy": 0.13554719373583793, + "epoch": 6.685408299866131, + "grad_norm": 0.49769842624664307, + "learning_rate": 6.215333342608944e-05, + "loss": 0.09153086662292481, + "mean_token_accuracy": 0.9705063331127167, + "num_tokens": 6156278.0, + "step": 2500 + }, + { + "entropy": 0.13915603026747703, + "epoch": 6.8192771084337345, + "grad_norm": 0.467375785112381, + "learning_rate": 5.7764756053780784e-05, + "loss": 0.09427680969238281, + "mean_token_accuracy": 0.9695158433914185, + "num_tokens": 6276774.0, + "step": 2550 + }, + { + "entropy": 0.13692217327654363, + "epoch": 6.953145917001339, + "grad_norm": 0.36858609318733215, + "learning_rate": 5.3476305233422516e-05, + "loss": 0.09176054954528809, + "mean_token_accuracy": 0.969379341006279, + "num_tokens": 6401444.0, + "step": 2600 + }, + { + "epoch": 7.0, + "eval_entropy": 0.24440797246992588, + "eval_loss": 0.891926646232605, + "eval_mean_token_accuracy": 0.8400790172815323, + "eval_num_tokens": 6443059.0, + "eval_runtime": 49.9579, + "eval_samples_per_second": 31.987, + "eval_steps_per_second": 4.003, + "step": 2618 + }, + { + "entropy": 0.1212329932234504, + "epoch": 7.085676037483267, + "grad_norm": 0.2522813677787781, + "learning_rate": 4.929731851946405e-05, + "loss": 0.07568974018096924, + "mean_token_accuracy": 0.975432159924748, + "num_tokens": 6525142.0, + "step": 2650 + }, + { + "entropy": 0.10857273273169994, + "epoch": 7.21954484605087, + "grad_norm": 0.2335294634103775, + "learning_rate": 4.5236895122230764e-05, + "loss": 0.06618132591247558, + "mean_token_accuracy": 0.9785620093345642, + "num_tokens": 6651930.0, + "step": 2700 + }, + { + "entropy": 0.1136517857015133, + "epoch": 7.353413654618474, + "grad_norm": 0.3027023375034332, + "learning_rate": 4.130387609555471e-05, + "loss": 0.06987609386444092, + "mean_token_accuracy": 0.9772803634405136, + "num_tokens": 6768469.0, + "step": 2750 + }, + { + "entropy": 0.10837352603673935, + "epoch": 7.4872824631860775, + "grad_norm": 0.3191539943218231, + "learning_rate": 3.750682508650807e-05, + "loss": 0.06725080013275146, + "mean_token_accuracy": 0.9786273115873336, + "num_tokens": 6892532.0, + "step": 2800 + }, + { + "entropy": 0.10379995822906495, + "epoch": 7.621151271753681, + "grad_norm": 0.21721133589744568, + "learning_rate": 3.3854009689154384e-05, + "loss": 0.06510573387145996, + "mean_token_accuracy": 0.9790040755271912, + "num_tokens": 7023373.0, + "step": 2850 + }, + { + "entropy": 0.10860319800674915, + "epoch": 7.755020080321285, + "grad_norm": 0.35063880681991577, + "learning_rate": 3.0353383442917245e-05, + "loss": 0.06781518936157227, + "mean_token_accuracy": 0.9782285010814666, + "num_tokens": 7146448.0, + "step": 2900 + }, + { + "entropy": 0.11041728757321835, + "epoch": 7.888888888888889, + "grad_norm": 0.27241161465644836, + "learning_rate": 2.7012568514763283e-05, + "loss": 0.06919246673583984, + "mean_token_accuracy": 0.9774098896980286, + "num_tokens": 7267680.0, + "step": 2950 + }, + { + "epoch": 8.0, + "eval_entropy": 0.216568651124835, + "eval_loss": 1.0052591562271118, + "eval_mean_token_accuracy": 0.841527444422245, + "eval_num_tokens": 7363496.0, + "eval_runtime": 49.8931, + "eval_samples_per_second": 32.028, + "eval_steps_per_second": 4.009, + "step": 2992 + }, + { + "entropy": 0.11074423059971646, + "epoch": 8.021419009370817, + "grad_norm": 0.12072166800498962, + "learning_rate": 2.3838839102906225e-05, + "loss": 0.07017123222351074, + "mean_token_accuracy": 0.9776547671568514, + "num_tokens": 7384150.0, + "step": 3000 + }, + { + "entropy": 0.10425103880465031, + "epoch": 8.15528781793842, + "grad_norm": 0.20287242531776428, + "learning_rate": 2.0839105598168276e-05, + "loss": 0.06177260398864746, + "mean_token_accuracy": 0.9801955896615983, + "num_tokens": 7502454.0, + "step": 3050 + }, + { + "entropy": 0.10014630381017924, + "epoch": 8.289156626506024, + "grad_norm": 0.1157577857375145, + "learning_rate": 1.8019899537486024e-05, + "loss": 0.05763424873352051, + "mean_token_accuracy": 0.9802741694450379, + "num_tokens": 7628419.0, + "step": 3100 + }, + { + "entropy": 0.09626397963613272, + "epoch": 8.423025435073628, + "grad_norm": 0.12889772653579712, + "learning_rate": 1.5387359382322228e-05, + "loss": 0.05830557346343994, + "mean_token_accuracy": 0.9807974797487259, + "num_tokens": 7751912.0, + "step": 3150 + }, + { + "entropy": 0.09667510379105806, + "epoch": 8.556894243641231, + "grad_norm": 0.18801453709602356, + "learning_rate": 1.2947217152949136e-05, + "loss": 0.058124661445617676, + "mean_token_accuracy": 0.98047631919384, + "num_tokens": 7877804.0, + "step": 3200 + }, + { + "entropy": 0.09806526392698288, + "epoch": 8.690763052208835, + "grad_norm": 0.11081992089748383, + "learning_rate": 1.0704785947705815e-05, + "loss": 0.05876843929290772, + "mean_token_accuracy": 0.9807141083478927, + "num_tokens": 8003296.0, + "step": 3250 + }, + { + "entropy": 0.1030188063904643, + "epoch": 8.824631860776439, + "grad_norm": 0.11520951986312866, + "learning_rate": 8.664948374404545e-06, + "loss": 0.06109299659729004, + "mean_token_accuracy": 0.9795061159133911, + "num_tokens": 8123859.0, + "step": 3300 + }, + { + "entropy": 0.10020156983286142, + "epoch": 8.958500669344042, + "grad_norm": 0.12751302123069763, + "learning_rate": 6.832145919075181e-06, + "loss": 0.05992648124694824, + "mean_token_accuracy": 0.9798818999528884, + "num_tokens": 8246971.0, + "step": 3350 + }, + { + "epoch": 9.0, + "eval_entropy": 0.20119483806192875, + "eval_loss": 1.0950454473495483, + "eval_mean_token_accuracy": 0.8413037645816803, + "eval_num_tokens": 8283933.0, + "eval_runtime": 49.8882, + "eval_samples_per_second": 32.032, + "eval_steps_per_second": 4.009, + "step": 3366 + }, + { + "entropy": 0.10281606312050964, + "epoch": 9.09103078982597, + "grad_norm": 0.15680046379566193, + "learning_rate": 5.210369275196194e-06, + "loss": 0.06013503551483154, + "mean_token_accuracy": 0.9802021769562153, + "num_tokens": 8361698.0, + "step": 3400 + }, + { + "entropy": 0.1000148943066597, + "epoch": 9.224899598393574, + "grad_norm": 0.13952124118804932, + "learning_rate": 3.803149654468773e-06, + "loss": 0.05827256202697754, + "mean_token_accuracy": 0.9804249608516693, + "num_tokens": 8479562.0, + "step": 3450 + }, + { + "entropy": 0.0985061563923955, + "epoch": 9.358768406961179, + "grad_norm": 0.15697507560253143, + "learning_rate": 2.6135510980540095e-06, + "loss": 0.05692038536071777, + "mean_token_accuracy": 0.9809943473339081, + "num_tokens": 8601271.0, + "step": 3500 + }, + { + "entropy": 0.09138282071799039, + "epoch": 9.492637215528783, + "grad_norm": 0.09984570741653442, + "learning_rate": 1.6441638050141134e-06, + "loss": 0.0537615442276001, + "mean_token_accuracy": 0.9819633334875106, + "num_tokens": 8730858.0, + "step": 3550 + }, + { + "entropy": 0.09622499626129866, + "epoch": 9.626506024096386, + "grad_norm": 0.15339775383472443, + "learning_rate": 8.970984924845772e-07, + "loss": 0.05584990501403809, + "mean_token_accuracy": 0.9813891124725341, + "num_tokens": 8855189.0, + "step": 3600 + }, + { + "entropy": 0.09465554103255272, + "epoch": 9.76037483266399, + "grad_norm": 0.15608705580234528, + "learning_rate": 3.7398179985693506e-07, + "loss": 0.055440669059753415, + "mean_token_accuracy": 0.981783646941185, + "num_tokens": 8979510.0, + "step": 3650 + }, + { + "entropy": 0.09096938490867615, + "epoch": 9.894243641231594, + "grad_norm": 0.1507030725479126, + "learning_rate": 7.595274697899605e-08, + "loss": 0.05279422283172607, + "mean_token_accuracy": 0.9820951598882676, + "num_tokens": 9111678.0, + "step": 3700 + }, + { + "epoch": 10.0, + "eval_entropy": 0.19496785469353198, + "eval_loss": 1.1435716152191162, + "eval_mean_token_accuracy": 0.8412449145317078, + "eval_num_tokens": 9204370.0, + "eval_runtime": 49.8996, + "eval_samples_per_second": 32.024, + "eval_steps_per_second": 4.008, + "step": 3740 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.0880231228879974e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a67d7b089767a1f6708064da5a07f65728eada3e --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_bias": false, + "lora_dropout": 0.0016857635936814886, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj", + "up_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2a73aa8e422ab932ece21b234fc42868cf085a1d --- /dev/null +++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/trainer_state.json @@ -0,0 +1,196 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 748, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.3424121737480164, + "epoch": 0.13386880856760375, + "grad_norm": 1.1671011447906494, + "learning_rate": 2.7185774847148058e-05, + "loss": 1.2469227600097657, + "mean_token_accuracy": 0.7243366748094558, + "num_tokens": 122643.0, + "step": 50 + }, + { + "entropy": 0.6559184691309929, + "epoch": 0.2677376171352075, + "grad_norm": 0.8399614691734314, + "learning_rate": 5.4926361425870564e-05, + "loss": 0.6245294189453126, + "mean_token_accuracy": 0.8268856984376908, + "num_tokens": 245702.0, + "step": 100 + }, + { + "entropy": 0.6082554489374161, + "epoch": 0.40160642570281124, + "grad_norm": 0.5691282749176025, + "learning_rate": 8.266694800459306e-05, + "loss": 0.5804204177856446, + "mean_token_accuracy": 0.8374843555688858, + "num_tokens": 371834.0, + "step": 150 + }, + { + "entropy": 0.5805956655740738, + "epoch": 0.535475234270415, + "grad_norm": 0.45651012659072876, + "learning_rate": 0.00011040753458331558, + "loss": 0.5495451354980468, + "mean_token_accuracy": 0.8450068402290344, + "num_tokens": 500948.0, + "step": 200 + }, + { + "entropy": 0.5810796636343002, + "epoch": 0.6693440428380187, + "grad_norm": 0.493431955575943, + "learning_rate": 0.00013814812116203808, + "loss": 0.5546633911132812, + "mean_token_accuracy": 0.8431127589941024, + "num_tokens": 621874.0, + "step": 250 + }, + { + "entropy": 0.5683389616012573, + "epoch": 0.8032128514056225, + "grad_norm": 0.4698493182659149, + "learning_rate": 0.00016588870774076058, + "loss": 0.5354468536376953, + "mean_token_accuracy": 0.8477097982168198, + "num_tokens": 746138.0, + "step": 300 + }, + { + "entropy": 0.5579970148205757, + "epoch": 0.9370816599732262, + "grad_norm": 0.5637578964233398, + "learning_rate": 0.0001936292943194831, + "loss": 0.5262400054931641, + "mean_token_accuracy": 0.8487933957576752, + "num_tokens": 868331.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5953671643137932, + "eval_loss": 0.5836789608001709, + "eval_mean_token_accuracy": 0.8373425653576851, + "eval_num_tokens": 920437.0, + "eval_runtime": 49.8765, + "eval_samples_per_second": 32.039, + "eval_steps_per_second": 4.01, + "step": 374 + }, + { + "entropy": 0.5491663054986433, + "epoch": 1.069611780455154, + "grad_norm": 0.6051601767539978, + "learning_rate": 0.0002074713460228683, + "loss": 0.5132473373413086, + "mean_token_accuracy": 0.8526828770685677, + "num_tokens": 980838.0, + "step": 400 + }, + { + "entropy": 0.5334427100419998, + "epoch": 1.2034805890227578, + "grad_norm": 0.4304046630859375, + "learning_rate": 0.00020724550557791978, + "loss": 0.497388916015625, + "mean_token_accuracy": 0.8541187030076981, + "num_tokens": 1104763.0, + "step": 450 + }, + { + "entropy": 0.5297759872674942, + "epoch": 1.3373493975903614, + "grad_norm": 0.4243815541267395, + "learning_rate": 0.00020679431642677408, + "loss": 0.49724563598632815, + "mean_token_accuracy": 0.8557562667131424, + "num_tokens": 1230459.0, + "step": 500 + }, + { + "entropy": 0.5287212440371514, + "epoch": 1.4712182061579653, + "grad_norm": 0.5970085859298706, + "learning_rate": 0.0002061187609762355, + "loss": 0.4903334808349609, + "mean_token_accuracy": 0.8566980129480362, + "num_tokens": 1356368.0, + "step": 550 + }, + { + "entropy": 0.5196757692098618, + "epoch": 1.605087014725569, + "grad_norm": 0.353085458278656, + "learning_rate": 0.00020522031016209576, + "loss": 0.48056564331054685, + "mean_token_accuracy": 0.8591135066747665, + "num_tokens": 1484569.0, + "step": 600 + }, + { + "entropy": 0.5124905353784561, + "epoch": 1.7389558232931726, + "grad_norm": 0.383682519197464, + "learning_rate": 0.00020410092024635923, + "loss": 0.47599597930908205, + "mean_token_accuracy": 0.8606968414783478, + "num_tokens": 1609962.0, + "step": 650 + }, + { + "entropy": 0.5109778612852096, + "epoch": 1.8728246318607764, + "grad_norm": 0.35959598422050476, + "learning_rate": 0.00020276302855773176, + "loss": 0.47350929260253904, + "mean_token_accuracy": 0.862987876534462, + "num_tokens": 1729667.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.5806624293327332, + "eval_loss": 0.5672553181648254, + "eval_mean_token_accuracy": 0.8352122050523758, + "eval_num_tokens": 1840874.0, + "eval_runtime": 49.8669, + "eval_samples_per_second": 32.045, + "eval_steps_per_second": 4.011, + "step": 748 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0167293607351706e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3020/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3020/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3020/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3020/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3020/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3020/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3020/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3020/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3020/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3020/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3020/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1875666670146c89e02e027ba11c310a297e6706 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3020/trainer_state.json @@ -0,0 +1,3205 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.277442702050664, + "eval_steps": 20, + "global_step": 3020, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.5588208342934016e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4f687182abb2b000330496898ed667ad4e50e6ea --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/trainer_state.json @@ -0,0 +1,3226 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.325693606755126, + "eval_steps": 20, + "global_step": 3040, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.5884529474138624e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1e13f384816b0831238b7adb2a32741d3037380c --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/trainer_state.json @@ -0,0 +1,3247 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.37394451145959, + "eval_steps": 20, + "global_step": 3060, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.619818601629747e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fd60b0849e427c00dfc36e07c3f23630ca683023 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/trainer_state.json @@ -0,0 +1,3268 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.422195416164053, + "eval_steps": 20, + "global_step": 3080, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.6497921431486976e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fae6dfa855f3a17fef64d96950ad7a68b07fea93 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/trainer_state.json @@ -0,0 +1,3289 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.470446320868517, + "eval_steps": 20, + "global_step": 3100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.677595678403021e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..18dbac322528fbf5ff116f0f99e4fde0653df402 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/trainer_state.json @@ -0,0 +1,3310 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.518697225572979, + "eval_steps": 20, + "global_step": 3120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.7098589021200896e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..444da11af0e9943c87bc71d5162369774b3ba03e --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/trainer_state.json @@ -0,0 +1,3331 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.566948130277443, + "eval_steps": 20, + "global_step": 3140, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.742792663052749e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5936cad0de2afc112537f6b102cd2808c80804b3 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/trainer_state.json @@ -0,0 +1,3352 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.615199034981906, + "eval_steps": 20, + "global_step": 3160, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.774101999182285e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e5bce9e9cb5ed34732dd937291baa9ff94108599 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/trainer_state.json @@ -0,0 +1,3373 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.66344993968637, + "eval_steps": 20, + "global_step": 3180, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.8035704378642944e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..dc5c64d87da635eca649acbc38be83e98f05fb3f --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/trainer_state.json @@ -0,0 +1,370 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.7720144752714113, + "eval_steps": 20, + "global_step": 320, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.911570708086784e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bf4f54c3f38ce689e2ef9480b7a3174042ecd8a2 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/trainer_state.json @@ -0,0 +1,3394 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.711700844390832, + "eval_steps": 20, + "global_step": 3200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.831581646062029e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d19773490f58e7da9635012ee482212f6baeb73f --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/trainer_state.json @@ -0,0 +1,3415 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.759951749095295, + "eval_steps": 20, + "global_step": 3220, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.86267626948736e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..706ca903c2185329afdcaaae04165376f19117d1 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/trainer_state.json @@ -0,0 +1,3436 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.808202653799759, + "eval_steps": 20, + "global_step": 3240, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.894677262114867e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..911c23180706772e28d978c164391e58ba3ddb19 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/trainer_state.json @@ -0,0 +1,3457 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.856453558504222, + "eval_steps": 20, + "global_step": 3260, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.926100994357299e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b96b1670bb8c47dcc688f21710f259d6fef34216 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/trainer_state.json @@ -0,0 +1,3478 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.904704463208685, + "eval_steps": 20, + "global_step": 3280, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.9552209648800256e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..21898c2c92025c319d9087f7528cd9bfe2dd1bc4 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/trainer_state.json @@ -0,0 +1,3499 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.952955367913148, + "eval_steps": 20, + "global_step": 3300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.985129388611635e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a3946a9434b242725d4a6aa74ac671d305285f00 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/trainer_state.json @@ -0,0 +1,3520 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.0, + "eval_steps": 20, + "global_step": 3320, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.0124324408720384e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..dd0b02e1edf669fdc6a109b632e60ec2df1a4897 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/trainer_state.json @@ -0,0 +1,3541 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.048250904704464, + "eval_steps": 20, + "global_step": 3340, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.0405738846444544e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fc2925148ae483c103bc106ffb35da68276827f6 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/trainer_state.json @@ -0,0 +1,3562 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.096501809408926, + "eval_steps": 20, + "global_step": 3360, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.0682313448623104e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..34385e35241ec7cae6cc44d36f3309df3c900c4d --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/trainer_state.json @@ -0,0 +1,3583 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.14475271411339, + "eval_steps": 20, + "global_step": 3380, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.098549834660147e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d966192d99130a657b653bcb78a3f79b8e8161d3 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/trainer_state.json @@ -0,0 +1,391 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8202653799758746, + "eval_steps": 20, + "global_step": 340, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.22209455669248e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..764e62f67588886399dc7b313c296d25e38cdfcb --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/trainer_state.json @@ -0,0 +1,3604 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.193003618817853, + "eval_steps": 20, + "global_step": 3400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.1325131606088704e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..27e85f1b827de56d9b45d6d5c3bee4ce0c004a48 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/trainer_state.json @@ -0,0 +1,3625 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.241254523522317, + "eval_steps": 20, + "global_step": 3420, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.1628334103469056e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fd171ceb05290457aaefc0622ba9194ac2d36953 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/trainer_state.json @@ -0,0 +1,3646 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.289505428226779, + "eval_steps": 20, + "global_step": 3440, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.189239553083699e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bcc509f733688600cad3436810e33db0aadbe3a0 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/trainer_state.json @@ -0,0 +1,3667 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.337756332931242, + "eval_steps": 20, + "global_step": 3460, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.2187555101510656e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..aebed85a25f7b5ce60770d403ed311a4f7f1af87 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/trainer_state.json @@ -0,0 +1,3688 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.386007237635706, + "eval_steps": 20, + "global_step": 3480, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.250758262718771e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..33bfa6a3eadf85968bcbd375da520b4e3478477d --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/trainer_state.json @@ -0,0 +1,3709 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.43425814234017, + "eval_steps": 20, + "global_step": 3500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.281882805127475e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4ecc3092b47c9bcc5c5b9f00c9c3d8a928237795 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/trainer_state.json @@ -0,0 +1,3730 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.482509047044632, + "eval_steps": 20, + "global_step": 3520, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.311726111071744e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ab52c6775bd9bff5aae74c43afefe1f05682e1d2 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/trainer_state.json @@ -0,0 +1,3751 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.530759951749095, + "eval_steps": 20, + "global_step": 3540, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.3430284074404864e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b04706628dec08abeb10745f419aa6be3ff5cb57 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/trainer_state.json @@ -0,0 +1,3772 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.579010856453559, + "eval_steps": 20, + "global_step": 3560, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.3726622805011456e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..20b0f94c9dc2c641cc7f11e769f02bffc1bdd675 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/trainer_state.json @@ -0,0 +1,3793 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.627261761158021, + "eval_steps": 20, + "global_step": 3580, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.405333810344243e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..246ac18182157e7f1f83a5f33ec269dfcd845d63 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/trainer_state.json @@ -0,0 +1,412 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8685162846803377, + "eval_steps": 20, + "global_step": 360, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.512766279860224e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..caa6a6d9ad15698c072c595439b01254e42a37ff --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/trainer_state.json @@ -0,0 +1,3814 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.675512665862485, + "eval_steps": 20, + "global_step": 3600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.4358529333246976e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e767c9f7195566248c41fd93951a5336b7b16f18 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/trainer_state.json @@ -0,0 +1,3835 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.723763570566948, + "eval_steps": 20, + "global_step": 3620, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.4669246775274496e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0932f4456d576c6b0206229802fcb19b93841b2b --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/trainer_state.json @@ -0,0 +1,3856 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.772014475271412, + "eval_steps": 20, + "global_step": 3640, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + }, + { + "entropy": 0.15217966660857202, + "epoch": 8.772014475271412, + "grad_norm": 0.5612668395042419, + "learning_rate": 1.078526238998661e-05, + "loss": 0.09193292260169983, + "mean_token_accuracy": 0.9713213533163071, + "num_tokens": 10209639.0, + "step": 3640 + }, + { + "epoch": 8.772014475271412, + "eval_entropy": 0.26003232162989925, + "eval_loss": 0.7822859883308411, + "eval_mean_token_accuracy": 0.8468798703691932, + "eval_num_tokens": 10209639.0, + "eval_runtime": 55.1916, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3640 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.498913350573568e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8f624b42e08260bf994e70c06a6ca43df45405f3 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/trainer_state.json @@ -0,0 +1,3877 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.820265379975874, + "eval_steps": 20, + "global_step": 3660, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + }, + { + "entropy": 0.15217966660857202, + "epoch": 8.772014475271412, + "grad_norm": 0.5612668395042419, + "learning_rate": 1.078526238998661e-05, + "loss": 0.09193292260169983, + "mean_token_accuracy": 0.9713213533163071, + "num_tokens": 10209639.0, + "step": 3640 + }, + { + "epoch": 8.772014475271412, + "eval_entropy": 0.26003232162989925, + "eval_loss": 0.7822859883308411, + "eval_mean_token_accuracy": 0.8468798703691932, + "eval_num_tokens": 10209639.0, + "eval_runtime": 55.1916, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3640 + }, + { + "entropy": 0.15681463126093148, + "epoch": 8.820265379975874, + "grad_norm": 0.5372915267944336, + "learning_rate": 9.969343163243224e-06, + "loss": 0.09630222916603089, + "mean_token_accuracy": 0.9702456504106521, + "num_tokens": 10263716.0, + "step": 3660 + }, + { + "epoch": 8.820265379975874, + "eval_entropy": 0.2598928563882796, + "eval_loss": 0.7829110026359558, + "eval_mean_token_accuracy": 0.8469086342983032, + "eval_num_tokens": 10263716.0, + "eval_runtime": 55.1354, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3660 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.5266922466651136e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8ee7507dac4ad8eeb47b67e89663c449c02e6524 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/trainer_state.json @@ -0,0 +1,3898 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.868516284680338, + "eval_steps": 20, + "global_step": 3680, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + }, + { + "entropy": 0.15217966660857202, + "epoch": 8.772014475271412, + "grad_norm": 0.5612668395042419, + "learning_rate": 1.078526238998661e-05, + "loss": 0.09193292260169983, + "mean_token_accuracy": 0.9713213533163071, + "num_tokens": 10209639.0, + "step": 3640 + }, + { + "epoch": 8.772014475271412, + "eval_entropy": 0.26003232162989925, + "eval_loss": 0.7822859883308411, + "eval_mean_token_accuracy": 0.8468798703691932, + "eval_num_tokens": 10209639.0, + "eval_runtime": 55.1916, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3640 + }, + { + "entropy": 0.15681463126093148, + "epoch": 8.820265379975874, + "grad_norm": 0.5372915267944336, + "learning_rate": 9.969343163243224e-06, + "loss": 0.09630222916603089, + "mean_token_accuracy": 0.9702456504106521, + "num_tokens": 10263716.0, + "step": 3660 + }, + { + "epoch": 8.820265379975874, + "eval_entropy": 0.2598928563882796, + "eval_loss": 0.7829110026359558, + "eval_mean_token_accuracy": 0.8469086342983032, + "eval_num_tokens": 10263716.0, + "eval_runtime": 55.1354, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3660 + }, + { + "entropy": 0.1535819811746478, + "epoch": 8.868516284680338, + "grad_norm": 0.575184166431427, + "learning_rate": 9.184158324815683e-06, + "loss": 0.09521135687828064, + "mean_token_accuracy": 0.9696467980742455, + "num_tokens": 10320171.0, + "step": 3680 + }, + { + "epoch": 8.868516284680338, + "eval_entropy": 0.2629028752948461, + "eval_loss": 0.7783747911453247, + "eval_mean_token_accuracy": 0.8463931284593732, + "eval_num_tokens": 10320171.0, + "eval_runtime": 55.1499, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3680 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.557906546023936e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b7e6ef7740c4778fd5ad573eefd3ad5052b3c517 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/trainer_state.json @@ -0,0 +1,3919 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.916767189384801, + "eval_steps": 20, + "global_step": 3700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + }, + { + "entropy": 0.15217966660857202, + "epoch": 8.772014475271412, + "grad_norm": 0.5612668395042419, + "learning_rate": 1.078526238998661e-05, + "loss": 0.09193292260169983, + "mean_token_accuracy": 0.9713213533163071, + "num_tokens": 10209639.0, + "step": 3640 + }, + { + "epoch": 8.772014475271412, + "eval_entropy": 0.26003232162989925, + "eval_loss": 0.7822859883308411, + "eval_mean_token_accuracy": 0.8468798703691932, + "eval_num_tokens": 10209639.0, + "eval_runtime": 55.1916, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3640 + }, + { + "entropy": 0.15681463126093148, + "epoch": 8.820265379975874, + "grad_norm": 0.5372915267944336, + "learning_rate": 9.969343163243224e-06, + "loss": 0.09630222916603089, + "mean_token_accuracy": 0.9702456504106521, + "num_tokens": 10263716.0, + "step": 3660 + }, + { + "epoch": 8.820265379975874, + "eval_entropy": 0.2598928563882796, + "eval_loss": 0.7829110026359558, + "eval_mean_token_accuracy": 0.8469086342983032, + "eval_num_tokens": 10263716.0, + "eval_runtime": 55.1354, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3660 + }, + { + "entropy": 0.1535819811746478, + "epoch": 8.868516284680338, + "grad_norm": 0.575184166431427, + "learning_rate": 9.184158324815683e-06, + "loss": 0.09521135687828064, + "mean_token_accuracy": 0.9696467980742455, + "num_tokens": 10320171.0, + "step": 3680 + }, + { + "epoch": 8.868516284680338, + "eval_entropy": 0.2629028752948461, + "eval_loss": 0.7783747911453247, + "eval_mean_token_accuracy": 0.8463931284593732, + "eval_num_tokens": 10320171.0, + "eval_runtime": 55.1499, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3680 + }, + { + "entropy": 0.15672272052615882, + "epoch": 8.916767189384801, + "grad_norm": 0.610146701335907, + "learning_rate": 8.429930072725457e-06, + "loss": 0.09335047006607056, + "mean_token_accuracy": 0.9709506019949913, + "num_tokens": 10377086.0, + "step": 3700 + }, + { + "epoch": 8.916767189384801, + "eval_entropy": 0.26106021611878044, + "eval_loss": 0.7824530005455017, + "eval_mean_token_accuracy": 0.8467274777005228, + "eval_num_tokens": 10377086.0, + "eval_runtime": 55.1677, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3700 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.587241229250867e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..96b4d58444f3c4523955ceaf9f94c79393ee3797 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/trainer_state.json @@ -0,0 +1,3940 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.965018094089265, + "eval_steps": 20, + "global_step": 3720, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + }, + { + "entropy": 0.15217966660857202, + "epoch": 8.772014475271412, + "grad_norm": 0.5612668395042419, + "learning_rate": 1.078526238998661e-05, + "loss": 0.09193292260169983, + "mean_token_accuracy": 0.9713213533163071, + "num_tokens": 10209639.0, + "step": 3640 + }, + { + "epoch": 8.772014475271412, + "eval_entropy": 0.26003232162989925, + "eval_loss": 0.7822859883308411, + "eval_mean_token_accuracy": 0.8468798703691932, + "eval_num_tokens": 10209639.0, + "eval_runtime": 55.1916, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3640 + }, + { + "entropy": 0.15681463126093148, + "epoch": 8.820265379975874, + "grad_norm": 0.5372915267944336, + "learning_rate": 9.969343163243224e-06, + "loss": 0.09630222916603089, + "mean_token_accuracy": 0.9702456504106521, + "num_tokens": 10263716.0, + "step": 3660 + }, + { + "epoch": 8.820265379975874, + "eval_entropy": 0.2598928563882796, + "eval_loss": 0.7829110026359558, + "eval_mean_token_accuracy": 0.8469086342983032, + "eval_num_tokens": 10263716.0, + "eval_runtime": 55.1354, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3660 + }, + { + "entropy": 0.1535819811746478, + "epoch": 8.868516284680338, + "grad_norm": 0.575184166431427, + "learning_rate": 9.184158324815683e-06, + "loss": 0.09521135687828064, + "mean_token_accuracy": 0.9696467980742455, + "num_tokens": 10320171.0, + "step": 3680 + }, + { + "epoch": 8.868516284680338, + "eval_entropy": 0.2629028752948461, + "eval_loss": 0.7783747911453247, + "eval_mean_token_accuracy": 0.8463931284593732, + "eval_num_tokens": 10320171.0, + "eval_runtime": 55.1499, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3680 + }, + { + "entropy": 0.15672272052615882, + "epoch": 8.916767189384801, + "grad_norm": 0.610146701335907, + "learning_rate": 8.429930072725457e-06, + "loss": 0.09335047006607056, + "mean_token_accuracy": 0.9709506019949913, + "num_tokens": 10377086.0, + "step": 3700 + }, + { + "epoch": 8.916767189384801, + "eval_entropy": 0.26106021611878044, + "eval_loss": 0.7824530005455017, + "eval_mean_token_accuracy": 0.8467274777005228, + "eval_num_tokens": 10377086.0, + "eval_runtime": 55.1677, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3700 + }, + { + "entropy": 0.15496804118156432, + "epoch": 8.965018094089265, + "grad_norm": 0.8847843408584595, + "learning_rate": 7.706871844646178e-06, + "loss": 0.09552072882652282, + "mean_token_accuracy": 0.9693930178880692, + "num_tokens": 10435272.0, + "step": 3720 + }, + { + "epoch": 8.965018094089265, + "eval_entropy": 0.26175538701622675, + "eval_loss": 0.7833470106124878, + "eval_mean_token_accuracy": 0.8464220507761065, + "eval_num_tokens": 10435272.0, + "eval_runtime": 55.1496, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3720 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.618293614111437e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..de846c80f2f0466d1512a59f1b376d26411815eb --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/trainer_state.json @@ -0,0 +1,3961 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.012062726176115, + "eval_steps": 20, + "global_step": 3740, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + }, + { + "entropy": 0.15217966660857202, + "epoch": 8.772014475271412, + "grad_norm": 0.5612668395042419, + "learning_rate": 1.078526238998661e-05, + "loss": 0.09193292260169983, + "mean_token_accuracy": 0.9713213533163071, + "num_tokens": 10209639.0, + "step": 3640 + }, + { + "epoch": 8.772014475271412, + "eval_entropy": 0.26003232162989925, + "eval_loss": 0.7822859883308411, + "eval_mean_token_accuracy": 0.8468798703691932, + "eval_num_tokens": 10209639.0, + "eval_runtime": 55.1916, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3640 + }, + { + "entropy": 0.15681463126093148, + "epoch": 8.820265379975874, + "grad_norm": 0.5372915267944336, + "learning_rate": 9.969343163243224e-06, + "loss": 0.09630222916603089, + "mean_token_accuracy": 0.9702456504106521, + "num_tokens": 10263716.0, + "step": 3660 + }, + { + "epoch": 8.820265379975874, + "eval_entropy": 0.2598928563882796, + "eval_loss": 0.7829110026359558, + "eval_mean_token_accuracy": 0.8469086342983032, + "eval_num_tokens": 10263716.0, + "eval_runtime": 55.1354, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3660 + }, + { + "entropy": 0.1535819811746478, + "epoch": 8.868516284680338, + "grad_norm": 0.575184166431427, + "learning_rate": 9.184158324815683e-06, + "loss": 0.09521135687828064, + "mean_token_accuracy": 0.9696467980742455, + "num_tokens": 10320171.0, + "step": 3680 + }, + { + "epoch": 8.868516284680338, + "eval_entropy": 0.2629028752948461, + "eval_loss": 0.7783747911453247, + "eval_mean_token_accuracy": 0.8463931284593732, + "eval_num_tokens": 10320171.0, + "eval_runtime": 55.1499, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3680 + }, + { + "entropy": 0.15672272052615882, + "epoch": 8.916767189384801, + "grad_norm": 0.610146701335907, + "learning_rate": 8.429930072725457e-06, + "loss": 0.09335047006607056, + "mean_token_accuracy": 0.9709506019949913, + "num_tokens": 10377086.0, + "step": 3700 + }, + { + "epoch": 8.916767189384801, + "eval_entropy": 0.26106021611878044, + "eval_loss": 0.7824530005455017, + "eval_mean_token_accuracy": 0.8467274777005228, + "eval_num_tokens": 10377086.0, + "eval_runtime": 55.1677, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3700 + }, + { + "entropy": 0.15496804118156432, + "epoch": 8.965018094089265, + "grad_norm": 0.8847843408584595, + "learning_rate": 7.706871844646178e-06, + "loss": 0.09552072882652282, + "mean_token_accuracy": 0.9693930178880692, + "num_tokens": 10435272.0, + "step": 3720 + }, + { + "epoch": 8.965018094089265, + "eval_entropy": 0.26175538701622675, + "eval_loss": 0.7833470106124878, + "eval_mean_token_accuracy": 0.8464220507761065, + "eval_num_tokens": 10435272.0, + "eval_runtime": 55.1496, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3720 + }, + { + "entropy": 0.15739625711471605, + "epoch": 9.012062726176115, + "grad_norm": 0.35802221298217773, + "learning_rate": 7.0151882575034775e-06, + "loss": 0.09389110803604125, + "mean_token_accuracy": 0.9715636464265677, + "num_tokens": 10487076.0, + "step": 3740 + }, + { + "epoch": 9.012062726176115, + "eval_entropy": 0.26084648198291155, + "eval_loss": 0.7853291034698486, + "eval_mean_token_accuracy": 0.8463665585169632, + "eval_num_tokens": 10487076.0, + "eval_runtime": 55.1572, + "eval_samples_per_second": 25.745, + "eval_steps_per_second": 3.227, + "step": 3740 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.6462551039985664e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d8764653500cd0c471cc5af4b34dd2fb9ce4edd0 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/trainer_state.json @@ -0,0 +1,3982 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.060313630880579, + "eval_steps": 20, + "global_step": 3760, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + }, + { + "entropy": 0.15217966660857202, + "epoch": 8.772014475271412, + "grad_norm": 0.5612668395042419, + "learning_rate": 1.078526238998661e-05, + "loss": 0.09193292260169983, + "mean_token_accuracy": 0.9713213533163071, + "num_tokens": 10209639.0, + "step": 3640 + }, + { + "epoch": 8.772014475271412, + "eval_entropy": 0.26003232162989925, + "eval_loss": 0.7822859883308411, + "eval_mean_token_accuracy": 0.8468798703691932, + "eval_num_tokens": 10209639.0, + "eval_runtime": 55.1916, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3640 + }, + { + "entropy": 0.15681463126093148, + "epoch": 8.820265379975874, + "grad_norm": 0.5372915267944336, + "learning_rate": 9.969343163243224e-06, + "loss": 0.09630222916603089, + "mean_token_accuracy": 0.9702456504106521, + "num_tokens": 10263716.0, + "step": 3660 + }, + { + "epoch": 8.820265379975874, + "eval_entropy": 0.2598928563882796, + "eval_loss": 0.7829110026359558, + "eval_mean_token_accuracy": 0.8469086342983032, + "eval_num_tokens": 10263716.0, + "eval_runtime": 55.1354, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3660 + }, + { + "entropy": 0.1535819811746478, + "epoch": 8.868516284680338, + "grad_norm": 0.575184166431427, + "learning_rate": 9.184158324815683e-06, + "loss": 0.09521135687828064, + "mean_token_accuracy": 0.9696467980742455, + "num_tokens": 10320171.0, + "step": 3680 + }, + { + "epoch": 8.868516284680338, + "eval_entropy": 0.2629028752948461, + "eval_loss": 0.7783747911453247, + "eval_mean_token_accuracy": 0.8463931284593732, + "eval_num_tokens": 10320171.0, + "eval_runtime": 55.1499, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3680 + }, + { + "entropy": 0.15672272052615882, + "epoch": 8.916767189384801, + "grad_norm": 0.610146701335907, + "learning_rate": 8.429930072725457e-06, + "loss": 0.09335047006607056, + "mean_token_accuracy": 0.9709506019949913, + "num_tokens": 10377086.0, + "step": 3700 + }, + { + "epoch": 8.916767189384801, + "eval_entropy": 0.26106021611878044, + "eval_loss": 0.7824530005455017, + "eval_mean_token_accuracy": 0.8467274777005228, + "eval_num_tokens": 10377086.0, + "eval_runtime": 55.1677, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3700 + }, + { + "entropy": 0.15496804118156432, + "epoch": 8.965018094089265, + "grad_norm": 0.8847843408584595, + "learning_rate": 7.706871844646178e-06, + "loss": 0.09552072882652282, + "mean_token_accuracy": 0.9693930178880692, + "num_tokens": 10435272.0, + "step": 3720 + }, + { + "epoch": 8.965018094089265, + "eval_entropy": 0.26175538701622675, + "eval_loss": 0.7833470106124878, + "eval_mean_token_accuracy": 0.8464220507761065, + "eval_num_tokens": 10435272.0, + "eval_runtime": 55.1496, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3720 + }, + { + "entropy": 0.15739625711471605, + "epoch": 9.012062726176115, + "grad_norm": 0.35802221298217773, + "learning_rate": 7.0151882575034775e-06, + "loss": 0.09389110803604125, + "mean_token_accuracy": 0.9715636464265677, + "num_tokens": 10487076.0, + "step": 3740 + }, + { + "epoch": 9.012062726176115, + "eval_entropy": 0.26084648198291155, + "eval_loss": 0.7853291034698486, + "eval_mean_token_accuracy": 0.8463665585169632, + "eval_num_tokens": 10487076.0, + "eval_runtime": 55.1572, + "eval_samples_per_second": 25.745, + "eval_steps_per_second": 3.227, + "step": 3740 + }, + { + "entropy": 0.14215838070958853, + "epoch": 9.060313630880579, + "grad_norm": 0.4684004783630371, + "learning_rate": 6.35507504957069e-06, + "loss": 0.07583575248718262, + "mean_token_accuracy": 0.9782516390085221, + "num_tokens": 10543520.0, + "step": 3760 + }, + { + "epoch": 9.060313630880579, + "eval_entropy": 0.254280548333452, + "eval_loss": 0.807111382484436, + "eval_mean_token_accuracy": 0.8463215992022096, + "eval_num_tokens": 10543520.0, + "eval_runtime": 55.1805, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3760 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.6746693385017344e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ad4a6d9c85e62edaf15c1822bb2b9288fa4b6916 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/trainer_state.json @@ -0,0 +1,4003 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.108564535585042, + "eval_steps": 20, + "global_step": 3780, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + }, + { + "entropy": 0.15217966660857202, + "epoch": 8.772014475271412, + "grad_norm": 0.5612668395042419, + "learning_rate": 1.078526238998661e-05, + "loss": 0.09193292260169983, + "mean_token_accuracy": 0.9713213533163071, + "num_tokens": 10209639.0, + "step": 3640 + }, + { + "epoch": 8.772014475271412, + "eval_entropy": 0.26003232162989925, + "eval_loss": 0.7822859883308411, + "eval_mean_token_accuracy": 0.8468798703691932, + "eval_num_tokens": 10209639.0, + "eval_runtime": 55.1916, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3640 + }, + { + "entropy": 0.15681463126093148, + "epoch": 8.820265379975874, + "grad_norm": 0.5372915267944336, + "learning_rate": 9.969343163243224e-06, + "loss": 0.09630222916603089, + "mean_token_accuracy": 0.9702456504106521, + "num_tokens": 10263716.0, + "step": 3660 + }, + { + "epoch": 8.820265379975874, + "eval_entropy": 0.2598928563882796, + "eval_loss": 0.7829110026359558, + "eval_mean_token_accuracy": 0.8469086342983032, + "eval_num_tokens": 10263716.0, + "eval_runtime": 55.1354, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3660 + }, + { + "entropy": 0.1535819811746478, + "epoch": 8.868516284680338, + "grad_norm": 0.575184166431427, + "learning_rate": 9.184158324815683e-06, + "loss": 0.09521135687828064, + "mean_token_accuracy": 0.9696467980742455, + "num_tokens": 10320171.0, + "step": 3680 + }, + { + "epoch": 8.868516284680338, + "eval_entropy": 0.2629028752948461, + "eval_loss": 0.7783747911453247, + "eval_mean_token_accuracy": 0.8463931284593732, + "eval_num_tokens": 10320171.0, + "eval_runtime": 55.1499, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3680 + }, + { + "entropy": 0.15672272052615882, + "epoch": 8.916767189384801, + "grad_norm": 0.610146701335907, + "learning_rate": 8.429930072725457e-06, + "loss": 0.09335047006607056, + "mean_token_accuracy": 0.9709506019949913, + "num_tokens": 10377086.0, + "step": 3700 + }, + { + "epoch": 8.916767189384801, + "eval_entropy": 0.26106021611878044, + "eval_loss": 0.7824530005455017, + "eval_mean_token_accuracy": 0.8467274777005228, + "eval_num_tokens": 10377086.0, + "eval_runtime": 55.1677, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3700 + }, + { + "entropy": 0.15496804118156432, + "epoch": 8.965018094089265, + "grad_norm": 0.8847843408584595, + "learning_rate": 7.706871844646178e-06, + "loss": 0.09552072882652282, + "mean_token_accuracy": 0.9693930178880692, + "num_tokens": 10435272.0, + "step": 3720 + }, + { + "epoch": 8.965018094089265, + "eval_entropy": 0.26175538701622675, + "eval_loss": 0.7833470106124878, + "eval_mean_token_accuracy": 0.8464220507761065, + "eval_num_tokens": 10435272.0, + "eval_runtime": 55.1496, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3720 + }, + { + "entropy": 0.15739625711471605, + "epoch": 9.012062726176115, + "grad_norm": 0.35802221298217773, + "learning_rate": 7.0151882575034775e-06, + "loss": 0.09389110803604125, + "mean_token_accuracy": 0.9715636464265677, + "num_tokens": 10487076.0, + "step": 3740 + }, + { + "epoch": 9.012062726176115, + "eval_entropy": 0.26084648198291155, + "eval_loss": 0.7853291034698486, + "eval_mean_token_accuracy": 0.8463665585169632, + "eval_num_tokens": 10487076.0, + "eval_runtime": 55.1572, + "eval_samples_per_second": 25.745, + "eval_steps_per_second": 3.227, + "step": 3740 + }, + { + "entropy": 0.14215838070958853, + "epoch": 9.060313630880579, + "grad_norm": 0.4684004783630371, + "learning_rate": 6.35507504957069e-06, + "loss": 0.07583575248718262, + "mean_token_accuracy": 0.9782516390085221, + "num_tokens": 10543520.0, + "step": 3760 + }, + { + "epoch": 9.060313630880579, + "eval_entropy": 0.254280548333452, + "eval_loss": 0.807111382484436, + "eval_mean_token_accuracy": 0.8463215992022096, + "eval_num_tokens": 10543520.0, + "eval_runtime": 55.1805, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3760 + }, + { + "entropy": 0.1408092312514782, + "epoch": 9.108564535585042, + "grad_norm": 0.43550431728363037, + "learning_rate": 5.726719025077231e-06, + "loss": 0.07781847715377807, + "mean_token_accuracy": 0.9768255725502968, + "num_tokens": 10596528.0, + "step": 3780 + }, + { + "epoch": 9.108564535585042, + "eval_entropy": 0.2505798229340757, + "eval_loss": 0.8224650025367737, + "eval_mean_token_accuracy": 0.8463811020502884, + "eval_num_tokens": 10596528.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3780 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.701782977198285e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6889fb4a6e581cf454b09fd4b4163c5a37b39aca --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/trainer_state.json @@ -0,0 +1,433 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.916767189384801, + "eval_steps": 20, + "global_step": 380, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.782477115265024e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c78793f469ccb48f644f68eeaca77830e9c16d1a --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/trainer_state.json @@ -0,0 +1,4024 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.156815440289506, + "eval_steps": 20, + "global_step": 3800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + }, + { + "entropy": 0.15217966660857202, + "epoch": 8.772014475271412, + "grad_norm": 0.5612668395042419, + "learning_rate": 1.078526238998661e-05, + "loss": 0.09193292260169983, + "mean_token_accuracy": 0.9713213533163071, + "num_tokens": 10209639.0, + "step": 3640 + }, + { + "epoch": 8.772014475271412, + "eval_entropy": 0.26003232162989925, + "eval_loss": 0.7822859883308411, + "eval_mean_token_accuracy": 0.8468798703691932, + "eval_num_tokens": 10209639.0, + "eval_runtime": 55.1916, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3640 + }, + { + "entropy": 0.15681463126093148, + "epoch": 8.820265379975874, + "grad_norm": 0.5372915267944336, + "learning_rate": 9.969343163243224e-06, + "loss": 0.09630222916603089, + "mean_token_accuracy": 0.9702456504106521, + "num_tokens": 10263716.0, + "step": 3660 + }, + { + "epoch": 8.820265379975874, + "eval_entropy": 0.2598928563882796, + "eval_loss": 0.7829110026359558, + "eval_mean_token_accuracy": 0.8469086342983032, + "eval_num_tokens": 10263716.0, + "eval_runtime": 55.1354, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3660 + }, + { + "entropy": 0.1535819811746478, + "epoch": 8.868516284680338, + "grad_norm": 0.575184166431427, + "learning_rate": 9.184158324815683e-06, + "loss": 0.09521135687828064, + "mean_token_accuracy": 0.9696467980742455, + "num_tokens": 10320171.0, + "step": 3680 + }, + { + "epoch": 8.868516284680338, + "eval_entropy": 0.2629028752948461, + "eval_loss": 0.7783747911453247, + "eval_mean_token_accuracy": 0.8463931284593732, + "eval_num_tokens": 10320171.0, + "eval_runtime": 55.1499, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3680 + }, + { + "entropy": 0.15672272052615882, + "epoch": 8.916767189384801, + "grad_norm": 0.610146701335907, + "learning_rate": 8.429930072725457e-06, + "loss": 0.09335047006607056, + "mean_token_accuracy": 0.9709506019949913, + "num_tokens": 10377086.0, + "step": 3700 + }, + { + "epoch": 8.916767189384801, + "eval_entropy": 0.26106021611878044, + "eval_loss": 0.7824530005455017, + "eval_mean_token_accuracy": 0.8467274777005228, + "eval_num_tokens": 10377086.0, + "eval_runtime": 55.1677, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3700 + }, + { + "entropy": 0.15496804118156432, + "epoch": 8.965018094089265, + "grad_norm": 0.8847843408584595, + "learning_rate": 7.706871844646178e-06, + "loss": 0.09552072882652282, + "mean_token_accuracy": 0.9693930178880692, + "num_tokens": 10435272.0, + "step": 3720 + }, + { + "epoch": 8.965018094089265, + "eval_entropy": 0.26175538701622675, + "eval_loss": 0.7833470106124878, + "eval_mean_token_accuracy": 0.8464220507761065, + "eval_num_tokens": 10435272.0, + "eval_runtime": 55.1496, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3720 + }, + { + "entropy": 0.15739625711471605, + "epoch": 9.012062726176115, + "grad_norm": 0.35802221298217773, + "learning_rate": 7.0151882575034775e-06, + "loss": 0.09389110803604125, + "mean_token_accuracy": 0.9715636464265677, + "num_tokens": 10487076.0, + "step": 3740 + }, + { + "epoch": 9.012062726176115, + "eval_entropy": 0.26084648198291155, + "eval_loss": 0.7853291034698486, + "eval_mean_token_accuracy": 0.8463665585169632, + "eval_num_tokens": 10487076.0, + "eval_runtime": 55.1572, + "eval_samples_per_second": 25.745, + "eval_steps_per_second": 3.227, + "step": 3740 + }, + { + "entropy": 0.14215838070958853, + "epoch": 9.060313630880579, + "grad_norm": 0.4684004783630371, + "learning_rate": 6.35507504957069e-06, + "loss": 0.07583575248718262, + "mean_token_accuracy": 0.9782516390085221, + "num_tokens": 10543520.0, + "step": 3760 + }, + { + "epoch": 9.060313630880579, + "eval_entropy": 0.254280548333452, + "eval_loss": 0.807111382484436, + "eval_mean_token_accuracy": 0.8463215992022096, + "eval_num_tokens": 10543520.0, + "eval_runtime": 55.1805, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3760 + }, + { + "entropy": 0.1408092312514782, + "epoch": 9.108564535585042, + "grad_norm": 0.43550431728363037, + "learning_rate": 5.726719025077231e-06, + "loss": 0.07781847715377807, + "mean_token_accuracy": 0.9768255725502968, + "num_tokens": 10596528.0, + "step": 3780 + }, + { + "epoch": 9.108564535585042, + "eval_entropy": 0.2505798229340757, + "eval_loss": 0.8224650025367737, + "eval_mean_token_accuracy": 0.8463811020502884, + "eval_num_tokens": 10596528.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3780 + }, + { + "entropy": 0.13897503707557918, + "epoch": 9.156815440289506, + "grad_norm": 0.5255228281021118, + "learning_rate": 5.130298001345343e-06, + "loss": 0.07528382539749146, + "mean_token_accuracy": 0.9768329098820686, + "num_tokens": 10655544.0, + "step": 3800 + }, + { + "epoch": 9.156815440289506, + "eval_entropy": 0.2509520294960965, + "eval_loss": 0.8247353434562683, + "eval_mean_token_accuracy": 0.845786559112956, + "eval_num_tokens": 10655544.0, + "eval_runtime": 55.151, + "eval_samples_per_second": 25.747, + "eval_steps_per_second": 3.228, + "step": 3800 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.734250353978368e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6b63403682192b7dd91cfba91d98862a3d8f45c4 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/trainer_state.json @@ -0,0 +1,4045 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.205066344993968, + "eval_steps": 20, + "global_step": 3820, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + }, + { + "entropy": 0.15217966660857202, + "epoch": 8.772014475271412, + "grad_norm": 0.5612668395042419, + "learning_rate": 1.078526238998661e-05, + "loss": 0.09193292260169983, + "mean_token_accuracy": 0.9713213533163071, + "num_tokens": 10209639.0, + "step": 3640 + }, + { + "epoch": 8.772014475271412, + "eval_entropy": 0.26003232162989925, + "eval_loss": 0.7822859883308411, + "eval_mean_token_accuracy": 0.8468798703691932, + "eval_num_tokens": 10209639.0, + "eval_runtime": 55.1916, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3640 + }, + { + "entropy": 0.15681463126093148, + "epoch": 8.820265379975874, + "grad_norm": 0.5372915267944336, + "learning_rate": 9.969343163243224e-06, + "loss": 0.09630222916603089, + "mean_token_accuracy": 0.9702456504106521, + "num_tokens": 10263716.0, + "step": 3660 + }, + { + "epoch": 8.820265379975874, + "eval_entropy": 0.2598928563882796, + "eval_loss": 0.7829110026359558, + "eval_mean_token_accuracy": 0.8469086342983032, + "eval_num_tokens": 10263716.0, + "eval_runtime": 55.1354, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3660 + }, + { + "entropy": 0.1535819811746478, + "epoch": 8.868516284680338, + "grad_norm": 0.575184166431427, + "learning_rate": 9.184158324815683e-06, + "loss": 0.09521135687828064, + "mean_token_accuracy": 0.9696467980742455, + "num_tokens": 10320171.0, + "step": 3680 + }, + { + "epoch": 8.868516284680338, + "eval_entropy": 0.2629028752948461, + "eval_loss": 0.7783747911453247, + "eval_mean_token_accuracy": 0.8463931284593732, + "eval_num_tokens": 10320171.0, + "eval_runtime": 55.1499, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3680 + }, + { + "entropy": 0.15672272052615882, + "epoch": 8.916767189384801, + "grad_norm": 0.610146701335907, + "learning_rate": 8.429930072725457e-06, + "loss": 0.09335047006607056, + "mean_token_accuracy": 0.9709506019949913, + "num_tokens": 10377086.0, + "step": 3700 + }, + { + "epoch": 8.916767189384801, + "eval_entropy": 0.26106021611878044, + "eval_loss": 0.7824530005455017, + "eval_mean_token_accuracy": 0.8467274777005228, + "eval_num_tokens": 10377086.0, + "eval_runtime": 55.1677, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3700 + }, + { + "entropy": 0.15496804118156432, + "epoch": 8.965018094089265, + "grad_norm": 0.8847843408584595, + "learning_rate": 7.706871844646178e-06, + "loss": 0.09552072882652282, + "mean_token_accuracy": 0.9693930178880692, + "num_tokens": 10435272.0, + "step": 3720 + }, + { + "epoch": 8.965018094089265, + "eval_entropy": 0.26175538701622675, + "eval_loss": 0.7833470106124878, + "eval_mean_token_accuracy": 0.8464220507761065, + "eval_num_tokens": 10435272.0, + "eval_runtime": 55.1496, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3720 + }, + { + "entropy": 0.15739625711471605, + "epoch": 9.012062726176115, + "grad_norm": 0.35802221298217773, + "learning_rate": 7.0151882575034775e-06, + "loss": 0.09389110803604125, + "mean_token_accuracy": 0.9715636464265677, + "num_tokens": 10487076.0, + "step": 3740 + }, + { + "epoch": 9.012062726176115, + "eval_entropy": 0.26084648198291155, + "eval_loss": 0.7853291034698486, + "eval_mean_token_accuracy": 0.8463665585169632, + "eval_num_tokens": 10487076.0, + "eval_runtime": 55.1572, + "eval_samples_per_second": 25.745, + "eval_steps_per_second": 3.227, + "step": 3740 + }, + { + "entropy": 0.14215838070958853, + "epoch": 9.060313630880579, + "grad_norm": 0.4684004783630371, + "learning_rate": 6.35507504957069e-06, + "loss": 0.07583575248718262, + "mean_token_accuracy": 0.9782516390085221, + "num_tokens": 10543520.0, + "step": 3760 + }, + { + "epoch": 9.060313630880579, + "eval_entropy": 0.254280548333452, + "eval_loss": 0.807111382484436, + "eval_mean_token_accuracy": 0.8463215992022096, + "eval_num_tokens": 10543520.0, + "eval_runtime": 55.1805, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3760 + }, + { + "entropy": 0.1408092312514782, + "epoch": 9.108564535585042, + "grad_norm": 0.43550431728363037, + "learning_rate": 5.726719025077231e-06, + "loss": 0.07781847715377807, + "mean_token_accuracy": 0.9768255725502968, + "num_tokens": 10596528.0, + "step": 3780 + }, + { + "epoch": 9.108564535585042, + "eval_entropy": 0.2505798229340757, + "eval_loss": 0.8224650025367737, + "eval_mean_token_accuracy": 0.8463811020502884, + "eval_num_tokens": 10596528.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3780 + }, + { + "entropy": 0.13897503707557918, + "epoch": 9.156815440289506, + "grad_norm": 0.5255228281021118, + "learning_rate": 5.130298001345343e-06, + "loss": 0.07528382539749146, + "mean_token_accuracy": 0.9768329098820686, + "num_tokens": 10655544.0, + "step": 3800 + }, + { + "epoch": 9.156815440289506, + "eval_entropy": 0.2509520294960965, + "eval_loss": 0.8247353434562683, + "eval_mean_token_accuracy": 0.845786559112956, + "eval_num_tokens": 10655544.0, + "eval_runtime": 55.151, + "eval_samples_per_second": 25.747, + "eval_steps_per_second": 3.228, + "step": 3800 + }, + { + "entropy": 0.13998530581593513, + "epoch": 9.205066344993968, + "grad_norm": 0.4610442817211151, + "learning_rate": 4.565980758469731e-06, + "loss": 0.07813523411750793, + "mean_token_accuracy": 0.9769201070070267, + "num_tokens": 10710737.0, + "step": 3820 + }, + { + "epoch": 9.205066344993968, + "eval_entropy": 0.2503350620691696, + "eval_loss": 0.8271610736846924, + "eval_mean_token_accuracy": 0.8455204879969693, + "eval_num_tokens": 10710737.0, + "eval_runtime": 55.1706, + "eval_samples_per_second": 25.738, + "eval_steps_per_second": 3.226, + "step": 3820 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.766591015064166e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7da928a01a9ad86a79c29a0d1cd90c70bd6a6e3e --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/trainer_state.json @@ -0,0 +1,4066 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.253317249698432, + "eval_steps": 20, + "global_step": 3840, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + }, + { + "entropy": 0.15217966660857202, + "epoch": 8.772014475271412, + "grad_norm": 0.5612668395042419, + "learning_rate": 1.078526238998661e-05, + "loss": 0.09193292260169983, + "mean_token_accuracy": 0.9713213533163071, + "num_tokens": 10209639.0, + "step": 3640 + }, + { + "epoch": 8.772014475271412, + "eval_entropy": 0.26003232162989925, + "eval_loss": 0.7822859883308411, + "eval_mean_token_accuracy": 0.8468798703691932, + "eval_num_tokens": 10209639.0, + "eval_runtime": 55.1916, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3640 + }, + { + "entropy": 0.15681463126093148, + "epoch": 8.820265379975874, + "grad_norm": 0.5372915267944336, + "learning_rate": 9.969343163243224e-06, + "loss": 0.09630222916603089, + "mean_token_accuracy": 0.9702456504106521, + "num_tokens": 10263716.0, + "step": 3660 + }, + { + "epoch": 8.820265379975874, + "eval_entropy": 0.2598928563882796, + "eval_loss": 0.7829110026359558, + "eval_mean_token_accuracy": 0.8469086342983032, + "eval_num_tokens": 10263716.0, + "eval_runtime": 55.1354, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3660 + }, + { + "entropy": 0.1535819811746478, + "epoch": 8.868516284680338, + "grad_norm": 0.575184166431427, + "learning_rate": 9.184158324815683e-06, + "loss": 0.09521135687828064, + "mean_token_accuracy": 0.9696467980742455, + "num_tokens": 10320171.0, + "step": 3680 + }, + { + "epoch": 8.868516284680338, + "eval_entropy": 0.2629028752948461, + "eval_loss": 0.7783747911453247, + "eval_mean_token_accuracy": 0.8463931284593732, + "eval_num_tokens": 10320171.0, + "eval_runtime": 55.1499, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3680 + }, + { + "entropy": 0.15672272052615882, + "epoch": 8.916767189384801, + "grad_norm": 0.610146701335907, + "learning_rate": 8.429930072725457e-06, + "loss": 0.09335047006607056, + "mean_token_accuracy": 0.9709506019949913, + "num_tokens": 10377086.0, + "step": 3700 + }, + { + "epoch": 8.916767189384801, + "eval_entropy": 0.26106021611878044, + "eval_loss": 0.7824530005455017, + "eval_mean_token_accuracy": 0.8467274777005228, + "eval_num_tokens": 10377086.0, + "eval_runtime": 55.1677, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3700 + }, + { + "entropy": 0.15496804118156432, + "epoch": 8.965018094089265, + "grad_norm": 0.8847843408584595, + "learning_rate": 7.706871844646178e-06, + "loss": 0.09552072882652282, + "mean_token_accuracy": 0.9693930178880692, + "num_tokens": 10435272.0, + "step": 3720 + }, + { + "epoch": 8.965018094089265, + "eval_entropy": 0.26175538701622675, + "eval_loss": 0.7833470106124878, + "eval_mean_token_accuracy": 0.8464220507761065, + "eval_num_tokens": 10435272.0, + "eval_runtime": 55.1496, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3720 + }, + { + "entropy": 0.15739625711471605, + "epoch": 9.012062726176115, + "grad_norm": 0.35802221298217773, + "learning_rate": 7.0151882575034775e-06, + "loss": 0.09389110803604125, + "mean_token_accuracy": 0.9715636464265677, + "num_tokens": 10487076.0, + "step": 3740 + }, + { + "epoch": 9.012062726176115, + "eval_entropy": 0.26084648198291155, + "eval_loss": 0.7853291034698486, + "eval_mean_token_accuracy": 0.8463665585169632, + "eval_num_tokens": 10487076.0, + "eval_runtime": 55.1572, + "eval_samples_per_second": 25.745, + "eval_steps_per_second": 3.227, + "step": 3740 + }, + { + "entropy": 0.14215838070958853, + "epoch": 9.060313630880579, + "grad_norm": 0.4684004783630371, + "learning_rate": 6.35507504957069e-06, + "loss": 0.07583575248718262, + "mean_token_accuracy": 0.9782516390085221, + "num_tokens": 10543520.0, + "step": 3760 + }, + { + "epoch": 9.060313630880579, + "eval_entropy": 0.254280548333452, + "eval_loss": 0.807111382484436, + "eval_mean_token_accuracy": 0.8463215992022096, + "eval_num_tokens": 10543520.0, + "eval_runtime": 55.1805, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3760 + }, + { + "entropy": 0.1408092312514782, + "epoch": 9.108564535585042, + "grad_norm": 0.43550431728363037, + "learning_rate": 5.726719025077231e-06, + "loss": 0.07781847715377807, + "mean_token_accuracy": 0.9768255725502968, + "num_tokens": 10596528.0, + "step": 3780 + }, + { + "epoch": 9.108564535585042, + "eval_entropy": 0.2505798229340757, + "eval_loss": 0.8224650025367737, + "eval_mean_token_accuracy": 0.8463811020502884, + "eval_num_tokens": 10596528.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3780 + }, + { + "entropy": 0.13897503707557918, + "epoch": 9.156815440289506, + "grad_norm": 0.5255228281021118, + "learning_rate": 5.130298001345343e-06, + "loss": 0.07528382539749146, + "mean_token_accuracy": 0.9768329098820686, + "num_tokens": 10655544.0, + "step": 3800 + }, + { + "epoch": 9.156815440289506, + "eval_entropy": 0.2509520294960965, + "eval_loss": 0.8247353434562683, + "eval_mean_token_accuracy": 0.845786559112956, + "eval_num_tokens": 10655544.0, + "eval_runtime": 55.151, + "eval_samples_per_second": 25.747, + "eval_steps_per_second": 3.228, + "step": 3800 + }, + { + "entropy": 0.13998530581593513, + "epoch": 9.205066344993968, + "grad_norm": 0.4610442817211151, + "learning_rate": 4.565980758469731e-06, + "loss": 0.07813523411750793, + "mean_token_accuracy": 0.9769201070070267, + "num_tokens": 10710737.0, + "step": 3820 + }, + { + "epoch": 9.205066344993968, + "eval_entropy": 0.2503350620691696, + "eval_loss": 0.8271610736846924, + "eval_mean_token_accuracy": 0.8455204879969693, + "eval_num_tokens": 10710737.0, + "eval_runtime": 55.1706, + "eval_samples_per_second": 25.738, + "eval_steps_per_second": 3.226, + "step": 3820 + }, + { + "entropy": 0.13162653651088477, + "epoch": 9.253317249698432, + "grad_norm": 0.3627360463142395, + "learning_rate": 4.033926991554922e-06, + "loss": 0.07141604423522949, + "mean_token_accuracy": 0.9792275875806808, + "num_tokens": 10771727.0, + "step": 3840 + }, + { + "epoch": 9.253317249698432, + "eval_entropy": 0.24897396087311627, + "eval_loss": 0.8307036757469177, + "eval_mean_token_accuracy": 0.8453606835911783, + "eval_num_tokens": 10771727.0, + "eval_runtime": 55.168, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3840 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.798039386469376e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f6b5d112b99ffb7d1d8d017f9427e64934ed3e59 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/trainer_state.json @@ -0,0 +1,4087 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.301568154402895, + "eval_steps": 20, + "global_step": 3860, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + }, + { + "entropy": 0.15217966660857202, + "epoch": 8.772014475271412, + "grad_norm": 0.5612668395042419, + "learning_rate": 1.078526238998661e-05, + "loss": 0.09193292260169983, + "mean_token_accuracy": 0.9713213533163071, + "num_tokens": 10209639.0, + "step": 3640 + }, + { + "epoch": 8.772014475271412, + "eval_entropy": 0.26003232162989925, + "eval_loss": 0.7822859883308411, + "eval_mean_token_accuracy": 0.8468798703691932, + "eval_num_tokens": 10209639.0, + "eval_runtime": 55.1916, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3640 + }, + { + "entropy": 0.15681463126093148, + "epoch": 8.820265379975874, + "grad_norm": 0.5372915267944336, + "learning_rate": 9.969343163243224e-06, + "loss": 0.09630222916603089, + "mean_token_accuracy": 0.9702456504106521, + "num_tokens": 10263716.0, + "step": 3660 + }, + { + "epoch": 8.820265379975874, + "eval_entropy": 0.2598928563882796, + "eval_loss": 0.7829110026359558, + "eval_mean_token_accuracy": 0.8469086342983032, + "eval_num_tokens": 10263716.0, + "eval_runtime": 55.1354, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3660 + }, + { + "entropy": 0.1535819811746478, + "epoch": 8.868516284680338, + "grad_norm": 0.575184166431427, + "learning_rate": 9.184158324815683e-06, + "loss": 0.09521135687828064, + "mean_token_accuracy": 0.9696467980742455, + "num_tokens": 10320171.0, + "step": 3680 + }, + { + "epoch": 8.868516284680338, + "eval_entropy": 0.2629028752948461, + "eval_loss": 0.7783747911453247, + "eval_mean_token_accuracy": 0.8463931284593732, + "eval_num_tokens": 10320171.0, + "eval_runtime": 55.1499, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3680 + }, + { + "entropy": 0.15672272052615882, + "epoch": 8.916767189384801, + "grad_norm": 0.610146701335907, + "learning_rate": 8.429930072725457e-06, + "loss": 0.09335047006607056, + "mean_token_accuracy": 0.9709506019949913, + "num_tokens": 10377086.0, + "step": 3700 + }, + { + "epoch": 8.916767189384801, + "eval_entropy": 0.26106021611878044, + "eval_loss": 0.7824530005455017, + "eval_mean_token_accuracy": 0.8467274777005228, + "eval_num_tokens": 10377086.0, + "eval_runtime": 55.1677, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3700 + }, + { + "entropy": 0.15496804118156432, + "epoch": 8.965018094089265, + "grad_norm": 0.8847843408584595, + "learning_rate": 7.706871844646178e-06, + "loss": 0.09552072882652282, + "mean_token_accuracy": 0.9693930178880692, + "num_tokens": 10435272.0, + "step": 3720 + }, + { + "epoch": 8.965018094089265, + "eval_entropy": 0.26175538701622675, + "eval_loss": 0.7833470106124878, + "eval_mean_token_accuracy": 0.8464220507761065, + "eval_num_tokens": 10435272.0, + "eval_runtime": 55.1496, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3720 + }, + { + "entropy": 0.15739625711471605, + "epoch": 9.012062726176115, + "grad_norm": 0.35802221298217773, + "learning_rate": 7.0151882575034775e-06, + "loss": 0.09389110803604125, + "mean_token_accuracy": 0.9715636464265677, + "num_tokens": 10487076.0, + "step": 3740 + }, + { + "epoch": 9.012062726176115, + "eval_entropy": 0.26084648198291155, + "eval_loss": 0.7853291034698486, + "eval_mean_token_accuracy": 0.8463665585169632, + "eval_num_tokens": 10487076.0, + "eval_runtime": 55.1572, + "eval_samples_per_second": 25.745, + "eval_steps_per_second": 3.227, + "step": 3740 + }, + { + "entropy": 0.14215838070958853, + "epoch": 9.060313630880579, + "grad_norm": 0.4684004783630371, + "learning_rate": 6.35507504957069e-06, + "loss": 0.07583575248718262, + "mean_token_accuracy": 0.9782516390085221, + "num_tokens": 10543520.0, + "step": 3760 + }, + { + "epoch": 9.060313630880579, + "eval_entropy": 0.254280548333452, + "eval_loss": 0.807111382484436, + "eval_mean_token_accuracy": 0.8463215992022096, + "eval_num_tokens": 10543520.0, + "eval_runtime": 55.1805, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3760 + }, + { + "entropy": 0.1408092312514782, + "epoch": 9.108564535585042, + "grad_norm": 0.43550431728363037, + "learning_rate": 5.726719025077231e-06, + "loss": 0.07781847715377807, + "mean_token_accuracy": 0.9768255725502968, + "num_tokens": 10596528.0, + "step": 3780 + }, + { + "epoch": 9.108564535585042, + "eval_entropy": 0.2505798229340757, + "eval_loss": 0.8224650025367737, + "eval_mean_token_accuracy": 0.8463811020502884, + "eval_num_tokens": 10596528.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3780 + }, + { + "entropy": 0.13897503707557918, + "epoch": 9.156815440289506, + "grad_norm": 0.5255228281021118, + "learning_rate": 5.130298001345343e-06, + "loss": 0.07528382539749146, + "mean_token_accuracy": 0.9768329098820686, + "num_tokens": 10655544.0, + "step": 3800 + }, + { + "epoch": 9.156815440289506, + "eval_entropy": 0.2509520294960965, + "eval_loss": 0.8247353434562683, + "eval_mean_token_accuracy": 0.845786559112956, + "eval_num_tokens": 10655544.0, + "eval_runtime": 55.151, + "eval_samples_per_second": 25.747, + "eval_steps_per_second": 3.228, + "step": 3800 + }, + { + "entropy": 0.13998530581593513, + "epoch": 9.205066344993968, + "grad_norm": 0.4610442817211151, + "learning_rate": 4.565980758469731e-06, + "loss": 0.07813523411750793, + "mean_token_accuracy": 0.9769201070070267, + "num_tokens": 10710737.0, + "step": 3820 + }, + { + "epoch": 9.205066344993968, + "eval_entropy": 0.2503350620691696, + "eval_loss": 0.8271610736846924, + "eval_mean_token_accuracy": 0.8455204879969693, + "eval_num_tokens": 10710737.0, + "eval_runtime": 55.1706, + "eval_samples_per_second": 25.738, + "eval_steps_per_second": 3.226, + "step": 3820 + }, + { + "entropy": 0.13162653651088477, + "epoch": 9.253317249698432, + "grad_norm": 0.3627360463142395, + "learning_rate": 4.033926991554922e-06, + "loss": 0.07141604423522949, + "mean_token_accuracy": 0.9792275875806808, + "num_tokens": 10771727.0, + "step": 3840 + }, + { + "epoch": 9.253317249698432, + "eval_entropy": 0.24897396087311627, + "eval_loss": 0.8307036757469177, + "eval_mean_token_accuracy": 0.8453606835911783, + "eval_num_tokens": 10771727.0, + "eval_runtime": 55.168, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3840 + }, + { + "entropy": 0.14422765467315912, + "epoch": 9.301568154402895, + "grad_norm": 0.5706251263618469, + "learning_rate": 3.53428726552335e-06, + "loss": 0.07931464314460754, + "mean_token_accuracy": 0.9752222061157226, + "num_tokens": 10825849.0, + "step": 3860 + }, + { + "epoch": 9.301568154402895, + "eval_entropy": 0.24914598389622872, + "eval_loss": 0.8312752842903137, + "eval_mean_token_accuracy": 0.8455510604917333, + "eval_num_tokens": 10825849.0, + "eval_runtime": 55.1849, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3860 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.827893252054835e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1f64c7d92ca51d624b16eb0cd736965cbab817ef --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/trainer_state.json @@ -0,0 +1,4108 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.349819059107359, + "eval_steps": 20, + "global_step": 3880, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + }, + { + "entropy": 0.15217966660857202, + "epoch": 8.772014475271412, + "grad_norm": 0.5612668395042419, + "learning_rate": 1.078526238998661e-05, + "loss": 0.09193292260169983, + "mean_token_accuracy": 0.9713213533163071, + "num_tokens": 10209639.0, + "step": 3640 + }, + { + "epoch": 8.772014475271412, + "eval_entropy": 0.26003232162989925, + "eval_loss": 0.7822859883308411, + "eval_mean_token_accuracy": 0.8468798703691932, + "eval_num_tokens": 10209639.0, + "eval_runtime": 55.1916, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3640 + }, + { + "entropy": 0.15681463126093148, + "epoch": 8.820265379975874, + "grad_norm": 0.5372915267944336, + "learning_rate": 9.969343163243224e-06, + "loss": 0.09630222916603089, + "mean_token_accuracy": 0.9702456504106521, + "num_tokens": 10263716.0, + "step": 3660 + }, + { + "epoch": 8.820265379975874, + "eval_entropy": 0.2598928563882796, + "eval_loss": 0.7829110026359558, + "eval_mean_token_accuracy": 0.8469086342983032, + "eval_num_tokens": 10263716.0, + "eval_runtime": 55.1354, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3660 + }, + { + "entropy": 0.1535819811746478, + "epoch": 8.868516284680338, + "grad_norm": 0.575184166431427, + "learning_rate": 9.184158324815683e-06, + "loss": 0.09521135687828064, + "mean_token_accuracy": 0.9696467980742455, + "num_tokens": 10320171.0, + "step": 3680 + }, + { + "epoch": 8.868516284680338, + "eval_entropy": 0.2629028752948461, + "eval_loss": 0.7783747911453247, + "eval_mean_token_accuracy": 0.8463931284593732, + "eval_num_tokens": 10320171.0, + "eval_runtime": 55.1499, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3680 + }, + { + "entropy": 0.15672272052615882, + "epoch": 8.916767189384801, + "grad_norm": 0.610146701335907, + "learning_rate": 8.429930072725457e-06, + "loss": 0.09335047006607056, + "mean_token_accuracy": 0.9709506019949913, + "num_tokens": 10377086.0, + "step": 3700 + }, + { + "epoch": 8.916767189384801, + "eval_entropy": 0.26106021611878044, + "eval_loss": 0.7824530005455017, + "eval_mean_token_accuracy": 0.8467274777005228, + "eval_num_tokens": 10377086.0, + "eval_runtime": 55.1677, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3700 + }, + { + "entropy": 0.15496804118156432, + "epoch": 8.965018094089265, + "grad_norm": 0.8847843408584595, + "learning_rate": 7.706871844646178e-06, + "loss": 0.09552072882652282, + "mean_token_accuracy": 0.9693930178880692, + "num_tokens": 10435272.0, + "step": 3720 + }, + { + "epoch": 8.965018094089265, + "eval_entropy": 0.26175538701622675, + "eval_loss": 0.7833470106124878, + "eval_mean_token_accuracy": 0.8464220507761065, + "eval_num_tokens": 10435272.0, + "eval_runtime": 55.1496, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3720 + }, + { + "entropy": 0.15739625711471605, + "epoch": 9.012062726176115, + "grad_norm": 0.35802221298217773, + "learning_rate": 7.0151882575034775e-06, + "loss": 0.09389110803604125, + "mean_token_accuracy": 0.9715636464265677, + "num_tokens": 10487076.0, + "step": 3740 + }, + { + "epoch": 9.012062726176115, + "eval_entropy": 0.26084648198291155, + "eval_loss": 0.7853291034698486, + "eval_mean_token_accuracy": 0.8463665585169632, + "eval_num_tokens": 10487076.0, + "eval_runtime": 55.1572, + "eval_samples_per_second": 25.745, + "eval_steps_per_second": 3.227, + "step": 3740 + }, + { + "entropy": 0.14215838070958853, + "epoch": 9.060313630880579, + "grad_norm": 0.4684004783630371, + "learning_rate": 6.35507504957069e-06, + "loss": 0.07583575248718262, + "mean_token_accuracy": 0.9782516390085221, + "num_tokens": 10543520.0, + "step": 3760 + }, + { + "epoch": 9.060313630880579, + "eval_entropy": 0.254280548333452, + "eval_loss": 0.807111382484436, + "eval_mean_token_accuracy": 0.8463215992022096, + "eval_num_tokens": 10543520.0, + "eval_runtime": 55.1805, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3760 + }, + { + "entropy": 0.1408092312514782, + "epoch": 9.108564535585042, + "grad_norm": 0.43550431728363037, + "learning_rate": 5.726719025077231e-06, + "loss": 0.07781847715377807, + "mean_token_accuracy": 0.9768255725502968, + "num_tokens": 10596528.0, + "step": 3780 + }, + { + "epoch": 9.108564535585042, + "eval_entropy": 0.2505798229340757, + "eval_loss": 0.8224650025367737, + "eval_mean_token_accuracy": 0.8463811020502884, + "eval_num_tokens": 10596528.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3780 + }, + { + "entropy": 0.13897503707557918, + "epoch": 9.156815440289506, + "grad_norm": 0.5255228281021118, + "learning_rate": 5.130298001345343e-06, + "loss": 0.07528382539749146, + "mean_token_accuracy": 0.9768329098820686, + "num_tokens": 10655544.0, + "step": 3800 + }, + { + "epoch": 9.156815440289506, + "eval_entropy": 0.2509520294960965, + "eval_loss": 0.8247353434562683, + "eval_mean_token_accuracy": 0.845786559112956, + "eval_num_tokens": 10655544.0, + "eval_runtime": 55.151, + "eval_samples_per_second": 25.747, + "eval_steps_per_second": 3.228, + "step": 3800 + }, + { + "entropy": 0.13998530581593513, + "epoch": 9.205066344993968, + "grad_norm": 0.4610442817211151, + "learning_rate": 4.565980758469731e-06, + "loss": 0.07813523411750793, + "mean_token_accuracy": 0.9769201070070267, + "num_tokens": 10710737.0, + "step": 3820 + }, + { + "epoch": 9.205066344993968, + "eval_entropy": 0.2503350620691696, + "eval_loss": 0.8271610736846924, + "eval_mean_token_accuracy": 0.8455204879969693, + "eval_num_tokens": 10710737.0, + "eval_runtime": 55.1706, + "eval_samples_per_second": 25.738, + "eval_steps_per_second": 3.226, + "step": 3820 + }, + { + "entropy": 0.13162653651088477, + "epoch": 9.253317249698432, + "grad_norm": 0.3627360463142395, + "learning_rate": 4.033926991554922e-06, + "loss": 0.07141604423522949, + "mean_token_accuracy": 0.9792275875806808, + "num_tokens": 10771727.0, + "step": 3840 + }, + { + "epoch": 9.253317249698432, + "eval_entropy": 0.24897396087311627, + "eval_loss": 0.8307036757469177, + "eval_mean_token_accuracy": 0.8453606835911783, + "eval_num_tokens": 10771727.0, + "eval_runtime": 55.168, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3840 + }, + { + "entropy": 0.14422765467315912, + "epoch": 9.301568154402895, + "grad_norm": 0.5706251263618469, + "learning_rate": 3.53428726552335e-06, + "loss": 0.07931464314460754, + "mean_token_accuracy": 0.9752222061157226, + "num_tokens": 10825849.0, + "step": 3860 + }, + { + "epoch": 9.301568154402895, + "eval_entropy": 0.24914598389622872, + "eval_loss": 0.8312752842903137, + "eval_mean_token_accuracy": 0.8455510604917333, + "eval_num_tokens": 10825849.0, + "eval_runtime": 55.1849, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3860 + }, + { + "entropy": 0.13540438804775476, + "epoch": 9.349819059107359, + "grad_norm": 0.6334630250930786, + "learning_rate": 3.0672029725073196e-06, + "loss": 0.07548695802688599, + "mean_token_accuracy": 0.9778543844819069, + "num_tokens": 10884611.0, + "step": 3880 + }, + { + "epoch": 9.349819059107359, + "eval_entropy": 0.24910795722114906, + "eval_loss": 0.8316059112548828, + "eval_mean_token_accuracy": 0.8456973557391864, + "eval_num_tokens": 10884611.0, + "eval_runtime": 55.1728, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3880 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.85832437802537e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ab81f60b899f34d88e31437457a50f64f888fe02 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/trainer_state.json @@ -0,0 +1,4129 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.398069963811821, + "eval_steps": 20, + "global_step": 3900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + }, + { + "entropy": 0.15217966660857202, + "epoch": 8.772014475271412, + "grad_norm": 0.5612668395042419, + "learning_rate": 1.078526238998661e-05, + "loss": 0.09193292260169983, + "mean_token_accuracy": 0.9713213533163071, + "num_tokens": 10209639.0, + "step": 3640 + }, + { + "epoch": 8.772014475271412, + "eval_entropy": 0.26003232162989925, + "eval_loss": 0.7822859883308411, + "eval_mean_token_accuracy": 0.8468798703691932, + "eval_num_tokens": 10209639.0, + "eval_runtime": 55.1916, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3640 + }, + { + "entropy": 0.15681463126093148, + "epoch": 8.820265379975874, + "grad_norm": 0.5372915267944336, + "learning_rate": 9.969343163243224e-06, + "loss": 0.09630222916603089, + "mean_token_accuracy": 0.9702456504106521, + "num_tokens": 10263716.0, + "step": 3660 + }, + { + "epoch": 8.820265379975874, + "eval_entropy": 0.2598928563882796, + "eval_loss": 0.7829110026359558, + "eval_mean_token_accuracy": 0.8469086342983032, + "eval_num_tokens": 10263716.0, + "eval_runtime": 55.1354, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3660 + }, + { + "entropy": 0.1535819811746478, + "epoch": 8.868516284680338, + "grad_norm": 0.575184166431427, + "learning_rate": 9.184158324815683e-06, + "loss": 0.09521135687828064, + "mean_token_accuracy": 0.9696467980742455, + "num_tokens": 10320171.0, + "step": 3680 + }, + { + "epoch": 8.868516284680338, + "eval_entropy": 0.2629028752948461, + "eval_loss": 0.7783747911453247, + "eval_mean_token_accuracy": 0.8463931284593732, + "eval_num_tokens": 10320171.0, + "eval_runtime": 55.1499, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3680 + }, + { + "entropy": 0.15672272052615882, + "epoch": 8.916767189384801, + "grad_norm": 0.610146701335907, + "learning_rate": 8.429930072725457e-06, + "loss": 0.09335047006607056, + "mean_token_accuracy": 0.9709506019949913, + "num_tokens": 10377086.0, + "step": 3700 + }, + { + "epoch": 8.916767189384801, + "eval_entropy": 0.26106021611878044, + "eval_loss": 0.7824530005455017, + "eval_mean_token_accuracy": 0.8467274777005228, + "eval_num_tokens": 10377086.0, + "eval_runtime": 55.1677, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3700 + }, + { + "entropy": 0.15496804118156432, + "epoch": 8.965018094089265, + "grad_norm": 0.8847843408584595, + "learning_rate": 7.706871844646178e-06, + "loss": 0.09552072882652282, + "mean_token_accuracy": 0.9693930178880692, + "num_tokens": 10435272.0, + "step": 3720 + }, + { + "epoch": 8.965018094089265, + "eval_entropy": 0.26175538701622675, + "eval_loss": 0.7833470106124878, + "eval_mean_token_accuracy": 0.8464220507761065, + "eval_num_tokens": 10435272.0, + "eval_runtime": 55.1496, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3720 + }, + { + "entropy": 0.15739625711471605, + "epoch": 9.012062726176115, + "grad_norm": 0.35802221298217773, + "learning_rate": 7.0151882575034775e-06, + "loss": 0.09389110803604125, + "mean_token_accuracy": 0.9715636464265677, + "num_tokens": 10487076.0, + "step": 3740 + }, + { + "epoch": 9.012062726176115, + "eval_entropy": 0.26084648198291155, + "eval_loss": 0.7853291034698486, + "eval_mean_token_accuracy": 0.8463665585169632, + "eval_num_tokens": 10487076.0, + "eval_runtime": 55.1572, + "eval_samples_per_second": 25.745, + "eval_steps_per_second": 3.227, + "step": 3740 + }, + { + "entropy": 0.14215838070958853, + "epoch": 9.060313630880579, + "grad_norm": 0.4684004783630371, + "learning_rate": 6.35507504957069e-06, + "loss": 0.07583575248718262, + "mean_token_accuracy": 0.9782516390085221, + "num_tokens": 10543520.0, + "step": 3760 + }, + { + "epoch": 9.060313630880579, + "eval_entropy": 0.254280548333452, + "eval_loss": 0.807111382484436, + "eval_mean_token_accuracy": 0.8463215992022096, + "eval_num_tokens": 10543520.0, + "eval_runtime": 55.1805, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3760 + }, + { + "entropy": 0.1408092312514782, + "epoch": 9.108564535585042, + "grad_norm": 0.43550431728363037, + "learning_rate": 5.726719025077231e-06, + "loss": 0.07781847715377807, + "mean_token_accuracy": 0.9768255725502968, + "num_tokens": 10596528.0, + "step": 3780 + }, + { + "epoch": 9.108564535585042, + "eval_entropy": 0.2505798229340757, + "eval_loss": 0.8224650025367737, + "eval_mean_token_accuracy": 0.8463811020502884, + "eval_num_tokens": 10596528.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3780 + }, + { + "entropy": 0.13897503707557918, + "epoch": 9.156815440289506, + "grad_norm": 0.5255228281021118, + "learning_rate": 5.130298001345343e-06, + "loss": 0.07528382539749146, + "mean_token_accuracy": 0.9768329098820686, + "num_tokens": 10655544.0, + "step": 3800 + }, + { + "epoch": 9.156815440289506, + "eval_entropy": 0.2509520294960965, + "eval_loss": 0.8247353434562683, + "eval_mean_token_accuracy": 0.845786559112956, + "eval_num_tokens": 10655544.0, + "eval_runtime": 55.151, + "eval_samples_per_second": 25.747, + "eval_steps_per_second": 3.228, + "step": 3800 + }, + { + "entropy": 0.13998530581593513, + "epoch": 9.205066344993968, + "grad_norm": 0.4610442817211151, + "learning_rate": 4.565980758469731e-06, + "loss": 0.07813523411750793, + "mean_token_accuracy": 0.9769201070070267, + "num_tokens": 10710737.0, + "step": 3820 + }, + { + "epoch": 9.205066344993968, + "eval_entropy": 0.2503350620691696, + "eval_loss": 0.8271610736846924, + "eval_mean_token_accuracy": 0.8455204879969693, + "eval_num_tokens": 10710737.0, + "eval_runtime": 55.1706, + "eval_samples_per_second": 25.738, + "eval_steps_per_second": 3.226, + "step": 3820 + }, + { + "entropy": 0.13162653651088477, + "epoch": 9.253317249698432, + "grad_norm": 0.3627360463142395, + "learning_rate": 4.033926991554922e-06, + "loss": 0.07141604423522949, + "mean_token_accuracy": 0.9792275875806808, + "num_tokens": 10771727.0, + "step": 3840 + }, + { + "epoch": 9.253317249698432, + "eval_entropy": 0.24897396087311627, + "eval_loss": 0.8307036757469177, + "eval_mean_token_accuracy": 0.8453606835911783, + "eval_num_tokens": 10771727.0, + "eval_runtime": 55.168, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3840 + }, + { + "entropy": 0.14422765467315912, + "epoch": 9.301568154402895, + "grad_norm": 0.5706251263618469, + "learning_rate": 3.53428726552335e-06, + "loss": 0.07931464314460754, + "mean_token_accuracy": 0.9752222061157226, + "num_tokens": 10825849.0, + "step": 3860 + }, + { + "epoch": 9.301568154402895, + "eval_entropy": 0.24914598389622872, + "eval_loss": 0.8312752842903137, + "eval_mean_token_accuracy": 0.8455510604917333, + "eval_num_tokens": 10825849.0, + "eval_runtime": 55.1849, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3860 + }, + { + "entropy": 0.13540438804775476, + "epoch": 9.349819059107359, + "grad_norm": 0.6334630250930786, + "learning_rate": 3.0672029725073196e-06, + "loss": 0.07548695802688599, + "mean_token_accuracy": 0.9778543844819069, + "num_tokens": 10884611.0, + "step": 3880 + }, + { + "epoch": 9.349819059107359, + "eval_entropy": 0.24910795722114906, + "eval_loss": 0.8316059112548828, + "eval_mean_token_accuracy": 0.8456973557391864, + "eval_num_tokens": 10884611.0, + "eval_runtime": 55.1728, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3880 + }, + { + "entropy": 0.1410717975348234, + "epoch": 9.398069963811821, + "grad_norm": 0.5009909868240356, + "learning_rate": 2.632806291836666e-06, + "loss": 0.07720760703086853, + "mean_token_accuracy": 0.9768648758530617, + "num_tokens": 10938300.0, + "step": 3900 + }, + { + "epoch": 9.398069963811821, + "eval_entropy": 0.24874219742048992, + "eval_loss": 0.8328408598899841, + "eval_mean_token_accuracy": 0.8455627888775943, + "eval_num_tokens": 10938300.0, + "eval_runtime": 55.1736, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3900 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.887157478295757e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d4d8a17085ce1fb264da87e7f4d6eea30f096053 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/trainer_state.json @@ -0,0 +1,4150 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.446320868516285, + "eval_steps": 20, + "global_step": 3920, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + }, + { + "entropy": 0.15217966660857202, + "epoch": 8.772014475271412, + "grad_norm": 0.5612668395042419, + "learning_rate": 1.078526238998661e-05, + "loss": 0.09193292260169983, + "mean_token_accuracy": 0.9713213533163071, + "num_tokens": 10209639.0, + "step": 3640 + }, + { + "epoch": 8.772014475271412, + "eval_entropy": 0.26003232162989925, + "eval_loss": 0.7822859883308411, + "eval_mean_token_accuracy": 0.8468798703691932, + "eval_num_tokens": 10209639.0, + "eval_runtime": 55.1916, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3640 + }, + { + "entropy": 0.15681463126093148, + "epoch": 8.820265379975874, + "grad_norm": 0.5372915267944336, + "learning_rate": 9.969343163243224e-06, + "loss": 0.09630222916603089, + "mean_token_accuracy": 0.9702456504106521, + "num_tokens": 10263716.0, + "step": 3660 + }, + { + "epoch": 8.820265379975874, + "eval_entropy": 0.2598928563882796, + "eval_loss": 0.7829110026359558, + "eval_mean_token_accuracy": 0.8469086342983032, + "eval_num_tokens": 10263716.0, + "eval_runtime": 55.1354, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3660 + }, + { + "entropy": 0.1535819811746478, + "epoch": 8.868516284680338, + "grad_norm": 0.575184166431427, + "learning_rate": 9.184158324815683e-06, + "loss": 0.09521135687828064, + "mean_token_accuracy": 0.9696467980742455, + "num_tokens": 10320171.0, + "step": 3680 + }, + { + "epoch": 8.868516284680338, + "eval_entropy": 0.2629028752948461, + "eval_loss": 0.7783747911453247, + "eval_mean_token_accuracy": 0.8463931284593732, + "eval_num_tokens": 10320171.0, + "eval_runtime": 55.1499, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3680 + }, + { + "entropy": 0.15672272052615882, + "epoch": 8.916767189384801, + "grad_norm": 0.610146701335907, + "learning_rate": 8.429930072725457e-06, + "loss": 0.09335047006607056, + "mean_token_accuracy": 0.9709506019949913, + "num_tokens": 10377086.0, + "step": 3700 + }, + { + "epoch": 8.916767189384801, + "eval_entropy": 0.26106021611878044, + "eval_loss": 0.7824530005455017, + "eval_mean_token_accuracy": 0.8467274777005228, + "eval_num_tokens": 10377086.0, + "eval_runtime": 55.1677, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3700 + }, + { + "entropy": 0.15496804118156432, + "epoch": 8.965018094089265, + "grad_norm": 0.8847843408584595, + "learning_rate": 7.706871844646178e-06, + "loss": 0.09552072882652282, + "mean_token_accuracy": 0.9693930178880692, + "num_tokens": 10435272.0, + "step": 3720 + }, + { + "epoch": 8.965018094089265, + "eval_entropy": 0.26175538701622675, + "eval_loss": 0.7833470106124878, + "eval_mean_token_accuracy": 0.8464220507761065, + "eval_num_tokens": 10435272.0, + "eval_runtime": 55.1496, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3720 + }, + { + "entropy": 0.15739625711471605, + "epoch": 9.012062726176115, + "grad_norm": 0.35802221298217773, + "learning_rate": 7.0151882575034775e-06, + "loss": 0.09389110803604125, + "mean_token_accuracy": 0.9715636464265677, + "num_tokens": 10487076.0, + "step": 3740 + }, + { + "epoch": 9.012062726176115, + "eval_entropy": 0.26084648198291155, + "eval_loss": 0.7853291034698486, + "eval_mean_token_accuracy": 0.8463665585169632, + "eval_num_tokens": 10487076.0, + "eval_runtime": 55.1572, + "eval_samples_per_second": 25.745, + "eval_steps_per_second": 3.227, + "step": 3740 + }, + { + "entropy": 0.14215838070958853, + "epoch": 9.060313630880579, + "grad_norm": 0.4684004783630371, + "learning_rate": 6.35507504957069e-06, + "loss": 0.07583575248718262, + "mean_token_accuracy": 0.9782516390085221, + "num_tokens": 10543520.0, + "step": 3760 + }, + { + "epoch": 9.060313630880579, + "eval_entropy": 0.254280548333452, + "eval_loss": 0.807111382484436, + "eval_mean_token_accuracy": 0.8463215992022096, + "eval_num_tokens": 10543520.0, + "eval_runtime": 55.1805, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3760 + }, + { + "entropy": 0.1408092312514782, + "epoch": 9.108564535585042, + "grad_norm": 0.43550431728363037, + "learning_rate": 5.726719025077231e-06, + "loss": 0.07781847715377807, + "mean_token_accuracy": 0.9768255725502968, + "num_tokens": 10596528.0, + "step": 3780 + }, + { + "epoch": 9.108564535585042, + "eval_entropy": 0.2505798229340757, + "eval_loss": 0.8224650025367737, + "eval_mean_token_accuracy": 0.8463811020502884, + "eval_num_tokens": 10596528.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3780 + }, + { + "entropy": 0.13897503707557918, + "epoch": 9.156815440289506, + "grad_norm": 0.5255228281021118, + "learning_rate": 5.130298001345343e-06, + "loss": 0.07528382539749146, + "mean_token_accuracy": 0.9768329098820686, + "num_tokens": 10655544.0, + "step": 3800 + }, + { + "epoch": 9.156815440289506, + "eval_entropy": 0.2509520294960965, + "eval_loss": 0.8247353434562683, + "eval_mean_token_accuracy": 0.845786559112956, + "eval_num_tokens": 10655544.0, + "eval_runtime": 55.151, + "eval_samples_per_second": 25.747, + "eval_steps_per_second": 3.228, + "step": 3800 + }, + { + "entropy": 0.13998530581593513, + "epoch": 9.205066344993968, + "grad_norm": 0.4610442817211151, + "learning_rate": 4.565980758469731e-06, + "loss": 0.07813523411750793, + "mean_token_accuracy": 0.9769201070070267, + "num_tokens": 10710737.0, + "step": 3820 + }, + { + "epoch": 9.205066344993968, + "eval_entropy": 0.2503350620691696, + "eval_loss": 0.8271610736846924, + "eval_mean_token_accuracy": 0.8455204879969693, + "eval_num_tokens": 10710737.0, + "eval_runtime": 55.1706, + "eval_samples_per_second": 25.738, + "eval_steps_per_second": 3.226, + "step": 3820 + }, + { + "entropy": 0.13162653651088477, + "epoch": 9.253317249698432, + "grad_norm": 0.3627360463142395, + "learning_rate": 4.033926991554922e-06, + "loss": 0.07141604423522949, + "mean_token_accuracy": 0.9792275875806808, + "num_tokens": 10771727.0, + "step": 3840 + }, + { + "epoch": 9.253317249698432, + "eval_entropy": 0.24897396087311627, + "eval_loss": 0.8307036757469177, + "eval_mean_token_accuracy": 0.8453606835911783, + "eval_num_tokens": 10771727.0, + "eval_runtime": 55.168, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3840 + }, + { + "entropy": 0.14422765467315912, + "epoch": 9.301568154402895, + "grad_norm": 0.5706251263618469, + "learning_rate": 3.53428726552335e-06, + "loss": 0.07931464314460754, + "mean_token_accuracy": 0.9752222061157226, + "num_tokens": 10825849.0, + "step": 3860 + }, + { + "epoch": 9.301568154402895, + "eval_entropy": 0.24914598389622872, + "eval_loss": 0.8312752842903137, + "eval_mean_token_accuracy": 0.8455510604917333, + "eval_num_tokens": 10825849.0, + "eval_runtime": 55.1849, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3860 + }, + { + "entropy": 0.13540438804775476, + "epoch": 9.349819059107359, + "grad_norm": 0.6334630250930786, + "learning_rate": 3.0672029725073196e-06, + "loss": 0.07548695802688599, + "mean_token_accuracy": 0.9778543844819069, + "num_tokens": 10884611.0, + "step": 3880 + }, + { + "epoch": 9.349819059107359, + "eval_entropy": 0.24910795722114906, + "eval_loss": 0.8316059112548828, + "eval_mean_token_accuracy": 0.8456973557391864, + "eval_num_tokens": 10884611.0, + "eval_runtime": 55.1728, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3880 + }, + { + "entropy": 0.1410717975348234, + "epoch": 9.398069963811821, + "grad_norm": 0.5009909868240356, + "learning_rate": 2.632806291836666e-06, + "loss": 0.07720760703086853, + "mean_token_accuracy": 0.9768648758530617, + "num_tokens": 10938300.0, + "step": 3900 + }, + { + "epoch": 9.398069963811821, + "eval_entropy": 0.24874219742048992, + "eval_loss": 0.8328408598899841, + "eval_mean_token_accuracy": 0.8455627888775943, + "eval_num_tokens": 10938300.0, + "eval_runtime": 55.1736, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3900 + }, + { + "entropy": 0.13674762714654207, + "epoch": 9.446320868516285, + "grad_norm": 0.46003177762031555, + "learning_rate": 2.231220152633621e-06, + "loss": 0.07583877444267273, + "mean_token_accuracy": 0.9772356480360032, + "num_tokens": 10994442.0, + "step": 3920 + }, + { + "epoch": 9.446320868516285, + "eval_entropy": 0.24896439030933915, + "eval_loss": 0.8331801891326904, + "eval_mean_token_accuracy": 0.8453999262177543, + "eval_num_tokens": 10994442.0, + "eval_runtime": 55.1815, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3920 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.916918067050701e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..11f3ef2a29d9b95de5a93587e2f4274fd037b534 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/trainer_state.json @@ -0,0 +1,4171 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.494571773220748, + "eval_steps": 20, + "global_step": 3940, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + }, + { + "entropy": 0.15217966660857202, + "epoch": 8.772014475271412, + "grad_norm": 0.5612668395042419, + "learning_rate": 1.078526238998661e-05, + "loss": 0.09193292260169983, + "mean_token_accuracy": 0.9713213533163071, + "num_tokens": 10209639.0, + "step": 3640 + }, + { + "epoch": 8.772014475271412, + "eval_entropy": 0.26003232162989925, + "eval_loss": 0.7822859883308411, + "eval_mean_token_accuracy": 0.8468798703691932, + "eval_num_tokens": 10209639.0, + "eval_runtime": 55.1916, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3640 + }, + { + "entropy": 0.15681463126093148, + "epoch": 8.820265379975874, + "grad_norm": 0.5372915267944336, + "learning_rate": 9.969343163243224e-06, + "loss": 0.09630222916603089, + "mean_token_accuracy": 0.9702456504106521, + "num_tokens": 10263716.0, + "step": 3660 + }, + { + "epoch": 8.820265379975874, + "eval_entropy": 0.2598928563882796, + "eval_loss": 0.7829110026359558, + "eval_mean_token_accuracy": 0.8469086342983032, + "eval_num_tokens": 10263716.0, + "eval_runtime": 55.1354, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3660 + }, + { + "entropy": 0.1535819811746478, + "epoch": 8.868516284680338, + "grad_norm": 0.575184166431427, + "learning_rate": 9.184158324815683e-06, + "loss": 0.09521135687828064, + "mean_token_accuracy": 0.9696467980742455, + "num_tokens": 10320171.0, + "step": 3680 + }, + { + "epoch": 8.868516284680338, + "eval_entropy": 0.2629028752948461, + "eval_loss": 0.7783747911453247, + "eval_mean_token_accuracy": 0.8463931284593732, + "eval_num_tokens": 10320171.0, + "eval_runtime": 55.1499, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3680 + }, + { + "entropy": 0.15672272052615882, + "epoch": 8.916767189384801, + "grad_norm": 0.610146701335907, + "learning_rate": 8.429930072725457e-06, + "loss": 0.09335047006607056, + "mean_token_accuracy": 0.9709506019949913, + "num_tokens": 10377086.0, + "step": 3700 + }, + { + "epoch": 8.916767189384801, + "eval_entropy": 0.26106021611878044, + "eval_loss": 0.7824530005455017, + "eval_mean_token_accuracy": 0.8467274777005228, + "eval_num_tokens": 10377086.0, + "eval_runtime": 55.1677, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3700 + }, + { + "entropy": 0.15496804118156432, + "epoch": 8.965018094089265, + "grad_norm": 0.8847843408584595, + "learning_rate": 7.706871844646178e-06, + "loss": 0.09552072882652282, + "mean_token_accuracy": 0.9693930178880692, + "num_tokens": 10435272.0, + "step": 3720 + }, + { + "epoch": 8.965018094089265, + "eval_entropy": 0.26175538701622675, + "eval_loss": 0.7833470106124878, + "eval_mean_token_accuracy": 0.8464220507761065, + "eval_num_tokens": 10435272.0, + "eval_runtime": 55.1496, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3720 + }, + { + "entropy": 0.15739625711471605, + "epoch": 9.012062726176115, + "grad_norm": 0.35802221298217773, + "learning_rate": 7.0151882575034775e-06, + "loss": 0.09389110803604125, + "mean_token_accuracy": 0.9715636464265677, + "num_tokens": 10487076.0, + "step": 3740 + }, + { + "epoch": 9.012062726176115, + "eval_entropy": 0.26084648198291155, + "eval_loss": 0.7853291034698486, + "eval_mean_token_accuracy": 0.8463665585169632, + "eval_num_tokens": 10487076.0, + "eval_runtime": 55.1572, + "eval_samples_per_second": 25.745, + "eval_steps_per_second": 3.227, + "step": 3740 + }, + { + "entropy": 0.14215838070958853, + "epoch": 9.060313630880579, + "grad_norm": 0.4684004783630371, + "learning_rate": 6.35507504957069e-06, + "loss": 0.07583575248718262, + "mean_token_accuracy": 0.9782516390085221, + "num_tokens": 10543520.0, + "step": 3760 + }, + { + "epoch": 9.060313630880579, + "eval_entropy": 0.254280548333452, + "eval_loss": 0.807111382484436, + "eval_mean_token_accuracy": 0.8463215992022096, + "eval_num_tokens": 10543520.0, + "eval_runtime": 55.1805, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3760 + }, + { + "entropy": 0.1408092312514782, + "epoch": 9.108564535585042, + "grad_norm": 0.43550431728363037, + "learning_rate": 5.726719025077231e-06, + "loss": 0.07781847715377807, + "mean_token_accuracy": 0.9768255725502968, + "num_tokens": 10596528.0, + "step": 3780 + }, + { + "epoch": 9.108564535585042, + "eval_entropy": 0.2505798229340757, + "eval_loss": 0.8224650025367737, + "eval_mean_token_accuracy": 0.8463811020502884, + "eval_num_tokens": 10596528.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3780 + }, + { + "entropy": 0.13897503707557918, + "epoch": 9.156815440289506, + "grad_norm": 0.5255228281021118, + "learning_rate": 5.130298001345343e-06, + "loss": 0.07528382539749146, + "mean_token_accuracy": 0.9768329098820686, + "num_tokens": 10655544.0, + "step": 3800 + }, + { + "epoch": 9.156815440289506, + "eval_entropy": 0.2509520294960965, + "eval_loss": 0.8247353434562683, + "eval_mean_token_accuracy": 0.845786559112956, + "eval_num_tokens": 10655544.0, + "eval_runtime": 55.151, + "eval_samples_per_second": 25.747, + "eval_steps_per_second": 3.228, + "step": 3800 + }, + { + "entropy": 0.13998530581593513, + "epoch": 9.205066344993968, + "grad_norm": 0.4610442817211151, + "learning_rate": 4.565980758469731e-06, + "loss": 0.07813523411750793, + "mean_token_accuracy": 0.9769201070070267, + "num_tokens": 10710737.0, + "step": 3820 + }, + { + "epoch": 9.205066344993968, + "eval_entropy": 0.2503350620691696, + "eval_loss": 0.8271610736846924, + "eval_mean_token_accuracy": 0.8455204879969693, + "eval_num_tokens": 10710737.0, + "eval_runtime": 55.1706, + "eval_samples_per_second": 25.738, + "eval_steps_per_second": 3.226, + "step": 3820 + }, + { + "entropy": 0.13162653651088477, + "epoch": 9.253317249698432, + "grad_norm": 0.3627360463142395, + "learning_rate": 4.033926991554922e-06, + "loss": 0.07141604423522949, + "mean_token_accuracy": 0.9792275875806808, + "num_tokens": 10771727.0, + "step": 3840 + }, + { + "epoch": 9.253317249698432, + "eval_entropy": 0.24897396087311627, + "eval_loss": 0.8307036757469177, + "eval_mean_token_accuracy": 0.8453606835911783, + "eval_num_tokens": 10771727.0, + "eval_runtime": 55.168, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3840 + }, + { + "entropy": 0.14422765467315912, + "epoch": 9.301568154402895, + "grad_norm": 0.5706251263618469, + "learning_rate": 3.53428726552335e-06, + "loss": 0.07931464314460754, + "mean_token_accuracy": 0.9752222061157226, + "num_tokens": 10825849.0, + "step": 3860 + }, + { + "epoch": 9.301568154402895, + "eval_entropy": 0.24914598389622872, + "eval_loss": 0.8312752842903137, + "eval_mean_token_accuracy": 0.8455510604917333, + "eval_num_tokens": 10825849.0, + "eval_runtime": 55.1849, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3860 + }, + { + "entropy": 0.13540438804775476, + "epoch": 9.349819059107359, + "grad_norm": 0.6334630250930786, + "learning_rate": 3.0672029725073196e-06, + "loss": 0.07548695802688599, + "mean_token_accuracy": 0.9778543844819069, + "num_tokens": 10884611.0, + "step": 3880 + }, + { + "epoch": 9.349819059107359, + "eval_entropy": 0.24910795722114906, + "eval_loss": 0.8316059112548828, + "eval_mean_token_accuracy": 0.8456973557391864, + "eval_num_tokens": 10884611.0, + "eval_runtime": 55.1728, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3880 + }, + { + "entropy": 0.1410717975348234, + "epoch": 9.398069963811821, + "grad_norm": 0.5009909868240356, + "learning_rate": 2.632806291836666e-06, + "loss": 0.07720760703086853, + "mean_token_accuracy": 0.9768648758530617, + "num_tokens": 10938300.0, + "step": 3900 + }, + { + "epoch": 9.398069963811821, + "eval_entropy": 0.24874219742048992, + "eval_loss": 0.8328408598899841, + "eval_mean_token_accuracy": 0.8455627888775943, + "eval_num_tokens": 10938300.0, + "eval_runtime": 55.1736, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3900 + }, + { + "entropy": 0.13674762714654207, + "epoch": 9.446320868516285, + "grad_norm": 0.46003177762031555, + "learning_rate": 2.231220152633621e-06, + "loss": 0.07583877444267273, + "mean_token_accuracy": 0.9772356480360032, + "num_tokens": 10994442.0, + "step": 3920 + }, + { + "epoch": 9.446320868516285, + "eval_entropy": 0.24896439030933915, + "eval_loss": 0.8331801891326904, + "eval_mean_token_accuracy": 0.8453999262177543, + "eval_num_tokens": 10994442.0, + "eval_runtime": 55.1815, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3920 + }, + { + "entropy": 0.1378044320270419, + "epoch": 9.494571773220748, + "grad_norm": 0.5056300759315491, + "learning_rate": 1.862558199025263e-06, + "loss": 0.07420622110366822, + "mean_token_accuracy": 0.9771117404103279, + "num_tokens": 11052708.0, + "step": 3940 + }, + { + "epoch": 9.494571773220748, + "eval_entropy": 0.2490867761413703, + "eval_loss": 0.8338391780853271, + "eval_mean_token_accuracy": 0.8453461571355884, + "eval_num_tokens": 11052708.0, + "eval_runtime": 59.1613, + "eval_samples_per_second": 24.002, + "eval_steps_per_second": 3.009, + "step": 3940 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.9500278220032e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..99d4cbc6cecf2fae622bcc7c3e55e4ce3c326cae --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/trainer_state.json @@ -0,0 +1,4192 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.54282267792521, + "eval_steps": 20, + "global_step": 3960, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + }, + { + "entropy": 0.15217966660857202, + "epoch": 8.772014475271412, + "grad_norm": 0.5612668395042419, + "learning_rate": 1.078526238998661e-05, + "loss": 0.09193292260169983, + "mean_token_accuracy": 0.9713213533163071, + "num_tokens": 10209639.0, + "step": 3640 + }, + { + "epoch": 8.772014475271412, + "eval_entropy": 0.26003232162989925, + "eval_loss": 0.7822859883308411, + "eval_mean_token_accuracy": 0.8468798703691932, + "eval_num_tokens": 10209639.0, + "eval_runtime": 55.1916, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3640 + }, + { + "entropy": 0.15681463126093148, + "epoch": 8.820265379975874, + "grad_norm": 0.5372915267944336, + "learning_rate": 9.969343163243224e-06, + "loss": 0.09630222916603089, + "mean_token_accuracy": 0.9702456504106521, + "num_tokens": 10263716.0, + "step": 3660 + }, + { + "epoch": 8.820265379975874, + "eval_entropy": 0.2598928563882796, + "eval_loss": 0.7829110026359558, + "eval_mean_token_accuracy": 0.8469086342983032, + "eval_num_tokens": 10263716.0, + "eval_runtime": 55.1354, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3660 + }, + { + "entropy": 0.1535819811746478, + "epoch": 8.868516284680338, + "grad_norm": 0.575184166431427, + "learning_rate": 9.184158324815683e-06, + "loss": 0.09521135687828064, + "mean_token_accuracy": 0.9696467980742455, + "num_tokens": 10320171.0, + "step": 3680 + }, + { + "epoch": 8.868516284680338, + "eval_entropy": 0.2629028752948461, + "eval_loss": 0.7783747911453247, + "eval_mean_token_accuracy": 0.8463931284593732, + "eval_num_tokens": 10320171.0, + "eval_runtime": 55.1499, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3680 + }, + { + "entropy": 0.15672272052615882, + "epoch": 8.916767189384801, + "grad_norm": 0.610146701335907, + "learning_rate": 8.429930072725457e-06, + "loss": 0.09335047006607056, + "mean_token_accuracy": 0.9709506019949913, + "num_tokens": 10377086.0, + "step": 3700 + }, + { + "epoch": 8.916767189384801, + "eval_entropy": 0.26106021611878044, + "eval_loss": 0.7824530005455017, + "eval_mean_token_accuracy": 0.8467274777005228, + "eval_num_tokens": 10377086.0, + "eval_runtime": 55.1677, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3700 + }, + { + "entropy": 0.15496804118156432, + "epoch": 8.965018094089265, + "grad_norm": 0.8847843408584595, + "learning_rate": 7.706871844646178e-06, + "loss": 0.09552072882652282, + "mean_token_accuracy": 0.9693930178880692, + "num_tokens": 10435272.0, + "step": 3720 + }, + { + "epoch": 8.965018094089265, + "eval_entropy": 0.26175538701622675, + "eval_loss": 0.7833470106124878, + "eval_mean_token_accuracy": 0.8464220507761065, + "eval_num_tokens": 10435272.0, + "eval_runtime": 55.1496, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3720 + }, + { + "entropy": 0.15739625711471605, + "epoch": 9.012062726176115, + "grad_norm": 0.35802221298217773, + "learning_rate": 7.0151882575034775e-06, + "loss": 0.09389110803604125, + "mean_token_accuracy": 0.9715636464265677, + "num_tokens": 10487076.0, + "step": 3740 + }, + { + "epoch": 9.012062726176115, + "eval_entropy": 0.26084648198291155, + "eval_loss": 0.7853291034698486, + "eval_mean_token_accuracy": 0.8463665585169632, + "eval_num_tokens": 10487076.0, + "eval_runtime": 55.1572, + "eval_samples_per_second": 25.745, + "eval_steps_per_second": 3.227, + "step": 3740 + }, + { + "entropy": 0.14215838070958853, + "epoch": 9.060313630880579, + "grad_norm": 0.4684004783630371, + "learning_rate": 6.35507504957069e-06, + "loss": 0.07583575248718262, + "mean_token_accuracy": 0.9782516390085221, + "num_tokens": 10543520.0, + "step": 3760 + }, + { + "epoch": 9.060313630880579, + "eval_entropy": 0.254280548333452, + "eval_loss": 0.807111382484436, + "eval_mean_token_accuracy": 0.8463215992022096, + "eval_num_tokens": 10543520.0, + "eval_runtime": 55.1805, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3760 + }, + { + "entropy": 0.1408092312514782, + "epoch": 9.108564535585042, + "grad_norm": 0.43550431728363037, + "learning_rate": 5.726719025077231e-06, + "loss": 0.07781847715377807, + "mean_token_accuracy": 0.9768255725502968, + "num_tokens": 10596528.0, + "step": 3780 + }, + { + "epoch": 9.108564535585042, + "eval_entropy": 0.2505798229340757, + "eval_loss": 0.8224650025367737, + "eval_mean_token_accuracy": 0.8463811020502884, + "eval_num_tokens": 10596528.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3780 + }, + { + "entropy": 0.13897503707557918, + "epoch": 9.156815440289506, + "grad_norm": 0.5255228281021118, + "learning_rate": 5.130298001345343e-06, + "loss": 0.07528382539749146, + "mean_token_accuracy": 0.9768329098820686, + "num_tokens": 10655544.0, + "step": 3800 + }, + { + "epoch": 9.156815440289506, + "eval_entropy": 0.2509520294960965, + "eval_loss": 0.8247353434562683, + "eval_mean_token_accuracy": 0.845786559112956, + "eval_num_tokens": 10655544.0, + "eval_runtime": 55.151, + "eval_samples_per_second": 25.747, + "eval_steps_per_second": 3.228, + "step": 3800 + }, + { + "entropy": 0.13998530581593513, + "epoch": 9.205066344993968, + "grad_norm": 0.4610442817211151, + "learning_rate": 4.565980758469731e-06, + "loss": 0.07813523411750793, + "mean_token_accuracy": 0.9769201070070267, + "num_tokens": 10710737.0, + "step": 3820 + }, + { + "epoch": 9.205066344993968, + "eval_entropy": 0.2503350620691696, + "eval_loss": 0.8271610736846924, + "eval_mean_token_accuracy": 0.8455204879969693, + "eval_num_tokens": 10710737.0, + "eval_runtime": 55.1706, + "eval_samples_per_second": 25.738, + "eval_steps_per_second": 3.226, + "step": 3820 + }, + { + "entropy": 0.13162653651088477, + "epoch": 9.253317249698432, + "grad_norm": 0.3627360463142395, + "learning_rate": 4.033926991554922e-06, + "loss": 0.07141604423522949, + "mean_token_accuracy": 0.9792275875806808, + "num_tokens": 10771727.0, + "step": 3840 + }, + { + "epoch": 9.253317249698432, + "eval_entropy": 0.24897396087311627, + "eval_loss": 0.8307036757469177, + "eval_mean_token_accuracy": 0.8453606835911783, + "eval_num_tokens": 10771727.0, + "eval_runtime": 55.168, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3840 + }, + { + "entropy": 0.14422765467315912, + "epoch": 9.301568154402895, + "grad_norm": 0.5706251263618469, + "learning_rate": 3.53428726552335e-06, + "loss": 0.07931464314460754, + "mean_token_accuracy": 0.9752222061157226, + "num_tokens": 10825849.0, + "step": 3860 + }, + { + "epoch": 9.301568154402895, + "eval_entropy": 0.24914598389622872, + "eval_loss": 0.8312752842903137, + "eval_mean_token_accuracy": 0.8455510604917333, + "eval_num_tokens": 10825849.0, + "eval_runtime": 55.1849, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3860 + }, + { + "entropy": 0.13540438804775476, + "epoch": 9.349819059107359, + "grad_norm": 0.6334630250930786, + "learning_rate": 3.0672029725073196e-06, + "loss": 0.07548695802688599, + "mean_token_accuracy": 0.9778543844819069, + "num_tokens": 10884611.0, + "step": 3880 + }, + { + "epoch": 9.349819059107359, + "eval_entropy": 0.24910795722114906, + "eval_loss": 0.8316059112548828, + "eval_mean_token_accuracy": 0.8456973557391864, + "eval_num_tokens": 10884611.0, + "eval_runtime": 55.1728, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3880 + }, + { + "entropy": 0.1410717975348234, + "epoch": 9.398069963811821, + "grad_norm": 0.5009909868240356, + "learning_rate": 2.632806291836666e-06, + "loss": 0.07720760703086853, + "mean_token_accuracy": 0.9768648758530617, + "num_tokens": 10938300.0, + "step": 3900 + }, + { + "epoch": 9.398069963811821, + "eval_entropy": 0.24874219742048992, + "eval_loss": 0.8328408598899841, + "eval_mean_token_accuracy": 0.8455627888775943, + "eval_num_tokens": 10938300.0, + "eval_runtime": 55.1736, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3900 + }, + { + "entropy": 0.13674762714654207, + "epoch": 9.446320868516285, + "grad_norm": 0.46003177762031555, + "learning_rate": 2.231220152633621e-06, + "loss": 0.07583877444267273, + "mean_token_accuracy": 0.9772356480360032, + "num_tokens": 10994442.0, + "step": 3920 + }, + { + "epoch": 9.446320868516285, + "eval_entropy": 0.24896439030933915, + "eval_loss": 0.8331801891326904, + "eval_mean_token_accuracy": 0.8453999262177543, + "eval_num_tokens": 10994442.0, + "eval_runtime": 55.1815, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3920 + }, + { + "entropy": 0.1378044320270419, + "epoch": 9.494571773220748, + "grad_norm": 0.5056300759315491, + "learning_rate": 1.862558199025263e-06, + "loss": 0.07420622110366822, + "mean_token_accuracy": 0.9771117404103279, + "num_tokens": 11052708.0, + "step": 3940 + }, + { + "epoch": 9.494571773220748, + "eval_entropy": 0.2490867761413703, + "eval_loss": 0.8338391780853271, + "eval_mean_token_accuracy": 0.8453461571355884, + "eval_num_tokens": 11052708.0, + "eval_runtime": 59.1613, + "eval_samples_per_second": 24.002, + "eval_steps_per_second": 3.009, + "step": 3940 + }, + { + "entropy": 0.13731490727514029, + "epoch": 9.54282267792521, + "grad_norm": 0.5770965218544006, + "learning_rate": 1.5269247579836162e-06, + "loss": 0.07547505497932434, + "mean_token_accuracy": 0.9772329092025757, + "num_tokens": 11106761.0, + "step": 3960 + }, + { + "epoch": 9.54282267792521, + "eval_entropy": 0.24866497918461147, + "eval_loss": 0.8349169492721558, + "eval_mean_token_accuracy": 0.8455062398080075, + "eval_num_tokens": 11106761.0, + "eval_runtime": 55.1484, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3960 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.977695841862246e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..70d6c02f8ec86b19c045682b96ab6f6e0590d754 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/trainer_state.json @@ -0,0 +1,4213 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.591073582629674, + "eval_steps": 20, + "global_step": 3980, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + }, + { + "entropy": 0.15217966660857202, + "epoch": 8.772014475271412, + "grad_norm": 0.5612668395042419, + "learning_rate": 1.078526238998661e-05, + "loss": 0.09193292260169983, + "mean_token_accuracy": 0.9713213533163071, + "num_tokens": 10209639.0, + "step": 3640 + }, + { + "epoch": 8.772014475271412, + "eval_entropy": 0.26003232162989925, + "eval_loss": 0.7822859883308411, + "eval_mean_token_accuracy": 0.8468798703691932, + "eval_num_tokens": 10209639.0, + "eval_runtime": 55.1916, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3640 + }, + { + "entropy": 0.15681463126093148, + "epoch": 8.820265379975874, + "grad_norm": 0.5372915267944336, + "learning_rate": 9.969343163243224e-06, + "loss": 0.09630222916603089, + "mean_token_accuracy": 0.9702456504106521, + "num_tokens": 10263716.0, + "step": 3660 + }, + { + "epoch": 8.820265379975874, + "eval_entropy": 0.2598928563882796, + "eval_loss": 0.7829110026359558, + "eval_mean_token_accuracy": 0.8469086342983032, + "eval_num_tokens": 10263716.0, + "eval_runtime": 55.1354, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3660 + }, + { + "entropy": 0.1535819811746478, + "epoch": 8.868516284680338, + "grad_norm": 0.575184166431427, + "learning_rate": 9.184158324815683e-06, + "loss": 0.09521135687828064, + "mean_token_accuracy": 0.9696467980742455, + "num_tokens": 10320171.0, + "step": 3680 + }, + { + "epoch": 8.868516284680338, + "eval_entropy": 0.2629028752948461, + "eval_loss": 0.7783747911453247, + "eval_mean_token_accuracy": 0.8463931284593732, + "eval_num_tokens": 10320171.0, + "eval_runtime": 55.1499, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3680 + }, + { + "entropy": 0.15672272052615882, + "epoch": 8.916767189384801, + "grad_norm": 0.610146701335907, + "learning_rate": 8.429930072725457e-06, + "loss": 0.09335047006607056, + "mean_token_accuracy": 0.9709506019949913, + "num_tokens": 10377086.0, + "step": 3700 + }, + { + "epoch": 8.916767189384801, + "eval_entropy": 0.26106021611878044, + "eval_loss": 0.7824530005455017, + "eval_mean_token_accuracy": 0.8467274777005228, + "eval_num_tokens": 10377086.0, + "eval_runtime": 55.1677, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3700 + }, + { + "entropy": 0.15496804118156432, + "epoch": 8.965018094089265, + "grad_norm": 0.8847843408584595, + "learning_rate": 7.706871844646178e-06, + "loss": 0.09552072882652282, + "mean_token_accuracy": 0.9693930178880692, + "num_tokens": 10435272.0, + "step": 3720 + }, + { + "epoch": 8.965018094089265, + "eval_entropy": 0.26175538701622675, + "eval_loss": 0.7833470106124878, + "eval_mean_token_accuracy": 0.8464220507761065, + "eval_num_tokens": 10435272.0, + "eval_runtime": 55.1496, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3720 + }, + { + "entropy": 0.15739625711471605, + "epoch": 9.012062726176115, + "grad_norm": 0.35802221298217773, + "learning_rate": 7.0151882575034775e-06, + "loss": 0.09389110803604125, + "mean_token_accuracy": 0.9715636464265677, + "num_tokens": 10487076.0, + "step": 3740 + }, + { + "epoch": 9.012062726176115, + "eval_entropy": 0.26084648198291155, + "eval_loss": 0.7853291034698486, + "eval_mean_token_accuracy": 0.8463665585169632, + "eval_num_tokens": 10487076.0, + "eval_runtime": 55.1572, + "eval_samples_per_second": 25.745, + "eval_steps_per_second": 3.227, + "step": 3740 + }, + { + "entropy": 0.14215838070958853, + "epoch": 9.060313630880579, + "grad_norm": 0.4684004783630371, + "learning_rate": 6.35507504957069e-06, + "loss": 0.07583575248718262, + "mean_token_accuracy": 0.9782516390085221, + "num_tokens": 10543520.0, + "step": 3760 + }, + { + "epoch": 9.060313630880579, + "eval_entropy": 0.254280548333452, + "eval_loss": 0.807111382484436, + "eval_mean_token_accuracy": 0.8463215992022096, + "eval_num_tokens": 10543520.0, + "eval_runtime": 55.1805, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3760 + }, + { + "entropy": 0.1408092312514782, + "epoch": 9.108564535585042, + "grad_norm": 0.43550431728363037, + "learning_rate": 5.726719025077231e-06, + "loss": 0.07781847715377807, + "mean_token_accuracy": 0.9768255725502968, + "num_tokens": 10596528.0, + "step": 3780 + }, + { + "epoch": 9.108564535585042, + "eval_entropy": 0.2505798229340757, + "eval_loss": 0.8224650025367737, + "eval_mean_token_accuracy": 0.8463811020502884, + "eval_num_tokens": 10596528.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3780 + }, + { + "entropy": 0.13897503707557918, + "epoch": 9.156815440289506, + "grad_norm": 0.5255228281021118, + "learning_rate": 5.130298001345343e-06, + "loss": 0.07528382539749146, + "mean_token_accuracy": 0.9768329098820686, + "num_tokens": 10655544.0, + "step": 3800 + }, + { + "epoch": 9.156815440289506, + "eval_entropy": 0.2509520294960965, + "eval_loss": 0.8247353434562683, + "eval_mean_token_accuracy": 0.845786559112956, + "eval_num_tokens": 10655544.0, + "eval_runtime": 55.151, + "eval_samples_per_second": 25.747, + "eval_steps_per_second": 3.228, + "step": 3800 + }, + { + "entropy": 0.13998530581593513, + "epoch": 9.205066344993968, + "grad_norm": 0.4610442817211151, + "learning_rate": 4.565980758469731e-06, + "loss": 0.07813523411750793, + "mean_token_accuracy": 0.9769201070070267, + "num_tokens": 10710737.0, + "step": 3820 + }, + { + "epoch": 9.205066344993968, + "eval_entropy": 0.2503350620691696, + "eval_loss": 0.8271610736846924, + "eval_mean_token_accuracy": 0.8455204879969693, + "eval_num_tokens": 10710737.0, + "eval_runtime": 55.1706, + "eval_samples_per_second": 25.738, + "eval_steps_per_second": 3.226, + "step": 3820 + }, + { + "entropy": 0.13162653651088477, + "epoch": 9.253317249698432, + "grad_norm": 0.3627360463142395, + "learning_rate": 4.033926991554922e-06, + "loss": 0.07141604423522949, + "mean_token_accuracy": 0.9792275875806808, + "num_tokens": 10771727.0, + "step": 3840 + }, + { + "epoch": 9.253317249698432, + "eval_entropy": 0.24897396087311627, + "eval_loss": 0.8307036757469177, + "eval_mean_token_accuracy": 0.8453606835911783, + "eval_num_tokens": 10771727.0, + "eval_runtime": 55.168, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3840 + }, + { + "entropy": 0.14422765467315912, + "epoch": 9.301568154402895, + "grad_norm": 0.5706251263618469, + "learning_rate": 3.53428726552335e-06, + "loss": 0.07931464314460754, + "mean_token_accuracy": 0.9752222061157226, + "num_tokens": 10825849.0, + "step": 3860 + }, + { + "epoch": 9.301568154402895, + "eval_entropy": 0.24914598389622872, + "eval_loss": 0.8312752842903137, + "eval_mean_token_accuracy": 0.8455510604917333, + "eval_num_tokens": 10825849.0, + "eval_runtime": 55.1849, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3860 + }, + { + "entropy": 0.13540438804775476, + "epoch": 9.349819059107359, + "grad_norm": 0.6334630250930786, + "learning_rate": 3.0672029725073196e-06, + "loss": 0.07548695802688599, + "mean_token_accuracy": 0.9778543844819069, + "num_tokens": 10884611.0, + "step": 3880 + }, + { + "epoch": 9.349819059107359, + "eval_entropy": 0.24910795722114906, + "eval_loss": 0.8316059112548828, + "eval_mean_token_accuracy": 0.8456973557391864, + "eval_num_tokens": 10884611.0, + "eval_runtime": 55.1728, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3880 + }, + { + "entropy": 0.1410717975348234, + "epoch": 9.398069963811821, + "grad_norm": 0.5009909868240356, + "learning_rate": 2.632806291836666e-06, + "loss": 0.07720760703086853, + "mean_token_accuracy": 0.9768648758530617, + "num_tokens": 10938300.0, + "step": 3900 + }, + { + "epoch": 9.398069963811821, + "eval_entropy": 0.24874219742048992, + "eval_loss": 0.8328408598899841, + "eval_mean_token_accuracy": 0.8455627888775943, + "eval_num_tokens": 10938300.0, + "eval_runtime": 55.1736, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3900 + }, + { + "entropy": 0.13674762714654207, + "epoch": 9.446320868516285, + "grad_norm": 0.46003177762031555, + "learning_rate": 2.231220152633621e-06, + "loss": 0.07583877444267273, + "mean_token_accuracy": 0.9772356480360032, + "num_tokens": 10994442.0, + "step": 3920 + }, + { + "epoch": 9.446320868516285, + "eval_entropy": 0.24896439030933915, + "eval_loss": 0.8331801891326904, + "eval_mean_token_accuracy": 0.8453999262177543, + "eval_num_tokens": 10994442.0, + "eval_runtime": 55.1815, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3920 + }, + { + "entropy": 0.1378044320270419, + "epoch": 9.494571773220748, + "grad_norm": 0.5056300759315491, + "learning_rate": 1.862558199025263e-06, + "loss": 0.07420622110366822, + "mean_token_accuracy": 0.9771117404103279, + "num_tokens": 11052708.0, + "step": 3940 + }, + { + "epoch": 9.494571773220748, + "eval_entropy": 0.2490867761413703, + "eval_loss": 0.8338391780853271, + "eval_mean_token_accuracy": 0.8453461571355884, + "eval_num_tokens": 11052708.0, + "eval_runtime": 59.1613, + "eval_samples_per_second": 24.002, + "eval_steps_per_second": 3.009, + "step": 3940 + }, + { + "entropy": 0.13731490727514029, + "epoch": 9.54282267792521, + "grad_norm": 0.5770965218544006, + "learning_rate": 1.5269247579836162e-06, + "loss": 0.07547505497932434, + "mean_token_accuracy": 0.9772329092025757, + "num_tokens": 11106761.0, + "step": 3960 + }, + { + "epoch": 9.54282267792521, + "eval_entropy": 0.24866497918461147, + "eval_loss": 0.8349169492721558, + "eval_mean_token_accuracy": 0.8455062398080075, + "eval_num_tokens": 11106761.0, + "eval_runtime": 55.1484, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3960 + }, + { + "entropy": 0.13437952492386102, + "epoch": 9.591073582629674, + "grad_norm": 0.5746839046478271, + "learning_rate": 1.2244148098023241e-06, + "loss": 0.0719214141368866, + "mean_token_accuracy": 0.9783813208341599, + "num_tokens": 11163241.0, + "step": 3980 + }, + { + "epoch": 9.591073582629674, + "eval_entropy": 0.2487325757909357, + "eval_loss": 0.8345232009887695, + "eval_mean_token_accuracy": 0.8452854909923639, + "eval_num_tokens": 11163241.0, + "eval_runtime": 55.2029, + "eval_samples_per_second": 25.723, + "eval_steps_per_second": 3.224, + "step": 3980 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.00520194722304e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..75455a63f2ec348fc0447419af314dd24bc30976 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/trainer_state.json @@ -0,0 +1,76 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.09650180940892641, + "eval_steps": 20, + "global_step": 40, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5990484447313920.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7200eb556657006c72b25c612af74af9a02996b5 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/trainer_state.json @@ -0,0 +1,454 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9650180940892642, + "eval_steps": 20, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.083550085005312e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c7f9405d9329cf7a8d5c17b1e530b6a335b50ef2 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/trainer_state.json @@ -0,0 +1,4234 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.639324487334138, + "eval_steps": 20, + "global_step": 4000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + }, + { + "entropy": 0.15217966660857202, + "epoch": 8.772014475271412, + "grad_norm": 0.5612668395042419, + "learning_rate": 1.078526238998661e-05, + "loss": 0.09193292260169983, + "mean_token_accuracy": 0.9713213533163071, + "num_tokens": 10209639.0, + "step": 3640 + }, + { + "epoch": 8.772014475271412, + "eval_entropy": 0.26003232162989925, + "eval_loss": 0.7822859883308411, + "eval_mean_token_accuracy": 0.8468798703691932, + "eval_num_tokens": 10209639.0, + "eval_runtime": 55.1916, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3640 + }, + { + "entropy": 0.15681463126093148, + "epoch": 8.820265379975874, + "grad_norm": 0.5372915267944336, + "learning_rate": 9.969343163243224e-06, + "loss": 0.09630222916603089, + "mean_token_accuracy": 0.9702456504106521, + "num_tokens": 10263716.0, + "step": 3660 + }, + { + "epoch": 8.820265379975874, + "eval_entropy": 0.2598928563882796, + "eval_loss": 0.7829110026359558, + "eval_mean_token_accuracy": 0.8469086342983032, + "eval_num_tokens": 10263716.0, + "eval_runtime": 55.1354, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3660 + }, + { + "entropy": 0.1535819811746478, + "epoch": 8.868516284680338, + "grad_norm": 0.575184166431427, + "learning_rate": 9.184158324815683e-06, + "loss": 0.09521135687828064, + "mean_token_accuracy": 0.9696467980742455, + "num_tokens": 10320171.0, + "step": 3680 + }, + { + "epoch": 8.868516284680338, + "eval_entropy": 0.2629028752948461, + "eval_loss": 0.7783747911453247, + "eval_mean_token_accuracy": 0.8463931284593732, + "eval_num_tokens": 10320171.0, + "eval_runtime": 55.1499, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3680 + }, + { + "entropy": 0.15672272052615882, + "epoch": 8.916767189384801, + "grad_norm": 0.610146701335907, + "learning_rate": 8.429930072725457e-06, + "loss": 0.09335047006607056, + "mean_token_accuracy": 0.9709506019949913, + "num_tokens": 10377086.0, + "step": 3700 + }, + { + "epoch": 8.916767189384801, + "eval_entropy": 0.26106021611878044, + "eval_loss": 0.7824530005455017, + "eval_mean_token_accuracy": 0.8467274777005228, + "eval_num_tokens": 10377086.0, + "eval_runtime": 55.1677, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3700 + }, + { + "entropy": 0.15496804118156432, + "epoch": 8.965018094089265, + "grad_norm": 0.8847843408584595, + "learning_rate": 7.706871844646178e-06, + "loss": 0.09552072882652282, + "mean_token_accuracy": 0.9693930178880692, + "num_tokens": 10435272.0, + "step": 3720 + }, + { + "epoch": 8.965018094089265, + "eval_entropy": 0.26175538701622675, + "eval_loss": 0.7833470106124878, + "eval_mean_token_accuracy": 0.8464220507761065, + "eval_num_tokens": 10435272.0, + "eval_runtime": 55.1496, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3720 + }, + { + "entropy": 0.15739625711471605, + "epoch": 9.012062726176115, + "grad_norm": 0.35802221298217773, + "learning_rate": 7.0151882575034775e-06, + "loss": 0.09389110803604125, + "mean_token_accuracy": 0.9715636464265677, + "num_tokens": 10487076.0, + "step": 3740 + }, + { + "epoch": 9.012062726176115, + "eval_entropy": 0.26084648198291155, + "eval_loss": 0.7853291034698486, + "eval_mean_token_accuracy": 0.8463665585169632, + "eval_num_tokens": 10487076.0, + "eval_runtime": 55.1572, + "eval_samples_per_second": 25.745, + "eval_steps_per_second": 3.227, + "step": 3740 + }, + { + "entropy": 0.14215838070958853, + "epoch": 9.060313630880579, + "grad_norm": 0.4684004783630371, + "learning_rate": 6.35507504957069e-06, + "loss": 0.07583575248718262, + "mean_token_accuracy": 0.9782516390085221, + "num_tokens": 10543520.0, + "step": 3760 + }, + { + "epoch": 9.060313630880579, + "eval_entropy": 0.254280548333452, + "eval_loss": 0.807111382484436, + "eval_mean_token_accuracy": 0.8463215992022096, + "eval_num_tokens": 10543520.0, + "eval_runtime": 55.1805, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3760 + }, + { + "entropy": 0.1408092312514782, + "epoch": 9.108564535585042, + "grad_norm": 0.43550431728363037, + "learning_rate": 5.726719025077231e-06, + "loss": 0.07781847715377807, + "mean_token_accuracy": 0.9768255725502968, + "num_tokens": 10596528.0, + "step": 3780 + }, + { + "epoch": 9.108564535585042, + "eval_entropy": 0.2505798229340757, + "eval_loss": 0.8224650025367737, + "eval_mean_token_accuracy": 0.8463811020502884, + "eval_num_tokens": 10596528.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3780 + }, + { + "entropy": 0.13897503707557918, + "epoch": 9.156815440289506, + "grad_norm": 0.5255228281021118, + "learning_rate": 5.130298001345343e-06, + "loss": 0.07528382539749146, + "mean_token_accuracy": 0.9768329098820686, + "num_tokens": 10655544.0, + "step": 3800 + }, + { + "epoch": 9.156815440289506, + "eval_entropy": 0.2509520294960965, + "eval_loss": 0.8247353434562683, + "eval_mean_token_accuracy": 0.845786559112956, + "eval_num_tokens": 10655544.0, + "eval_runtime": 55.151, + "eval_samples_per_second": 25.747, + "eval_steps_per_second": 3.228, + "step": 3800 + }, + { + "entropy": 0.13998530581593513, + "epoch": 9.205066344993968, + "grad_norm": 0.4610442817211151, + "learning_rate": 4.565980758469731e-06, + "loss": 0.07813523411750793, + "mean_token_accuracy": 0.9769201070070267, + "num_tokens": 10710737.0, + "step": 3820 + }, + { + "epoch": 9.205066344993968, + "eval_entropy": 0.2503350620691696, + "eval_loss": 0.8271610736846924, + "eval_mean_token_accuracy": 0.8455204879969693, + "eval_num_tokens": 10710737.0, + "eval_runtime": 55.1706, + "eval_samples_per_second": 25.738, + "eval_steps_per_second": 3.226, + "step": 3820 + }, + { + "entropy": 0.13162653651088477, + "epoch": 9.253317249698432, + "grad_norm": 0.3627360463142395, + "learning_rate": 4.033926991554922e-06, + "loss": 0.07141604423522949, + "mean_token_accuracy": 0.9792275875806808, + "num_tokens": 10771727.0, + "step": 3840 + }, + { + "epoch": 9.253317249698432, + "eval_entropy": 0.24897396087311627, + "eval_loss": 0.8307036757469177, + "eval_mean_token_accuracy": 0.8453606835911783, + "eval_num_tokens": 10771727.0, + "eval_runtime": 55.168, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3840 + }, + { + "entropy": 0.14422765467315912, + "epoch": 9.301568154402895, + "grad_norm": 0.5706251263618469, + "learning_rate": 3.53428726552335e-06, + "loss": 0.07931464314460754, + "mean_token_accuracy": 0.9752222061157226, + "num_tokens": 10825849.0, + "step": 3860 + }, + { + "epoch": 9.301568154402895, + "eval_entropy": 0.24914598389622872, + "eval_loss": 0.8312752842903137, + "eval_mean_token_accuracy": 0.8455510604917333, + "eval_num_tokens": 10825849.0, + "eval_runtime": 55.1849, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3860 + }, + { + "entropy": 0.13540438804775476, + "epoch": 9.349819059107359, + "grad_norm": 0.6334630250930786, + "learning_rate": 3.0672029725073196e-06, + "loss": 0.07548695802688599, + "mean_token_accuracy": 0.9778543844819069, + "num_tokens": 10884611.0, + "step": 3880 + }, + { + "epoch": 9.349819059107359, + "eval_entropy": 0.24910795722114906, + "eval_loss": 0.8316059112548828, + "eval_mean_token_accuracy": 0.8456973557391864, + "eval_num_tokens": 10884611.0, + "eval_runtime": 55.1728, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3880 + }, + { + "entropy": 0.1410717975348234, + "epoch": 9.398069963811821, + "grad_norm": 0.5009909868240356, + "learning_rate": 2.632806291836666e-06, + "loss": 0.07720760703086853, + "mean_token_accuracy": 0.9768648758530617, + "num_tokens": 10938300.0, + "step": 3900 + }, + { + "epoch": 9.398069963811821, + "eval_entropy": 0.24874219742048992, + "eval_loss": 0.8328408598899841, + "eval_mean_token_accuracy": 0.8455627888775943, + "eval_num_tokens": 10938300.0, + "eval_runtime": 55.1736, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3900 + }, + { + "entropy": 0.13674762714654207, + "epoch": 9.446320868516285, + "grad_norm": 0.46003177762031555, + "learning_rate": 2.231220152633621e-06, + "loss": 0.07583877444267273, + "mean_token_accuracy": 0.9772356480360032, + "num_tokens": 10994442.0, + "step": 3920 + }, + { + "epoch": 9.446320868516285, + "eval_entropy": 0.24896439030933915, + "eval_loss": 0.8331801891326904, + "eval_mean_token_accuracy": 0.8453999262177543, + "eval_num_tokens": 10994442.0, + "eval_runtime": 55.1815, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3920 + }, + { + "entropy": 0.1378044320270419, + "epoch": 9.494571773220748, + "grad_norm": 0.5056300759315491, + "learning_rate": 1.862558199025263e-06, + "loss": 0.07420622110366822, + "mean_token_accuracy": 0.9771117404103279, + "num_tokens": 11052708.0, + "step": 3940 + }, + { + "epoch": 9.494571773220748, + "eval_entropy": 0.2490867761413703, + "eval_loss": 0.8338391780853271, + "eval_mean_token_accuracy": 0.8453461571355884, + "eval_num_tokens": 11052708.0, + "eval_runtime": 59.1613, + "eval_samples_per_second": 24.002, + "eval_steps_per_second": 3.009, + "step": 3940 + }, + { + "entropy": 0.13731490727514029, + "epoch": 9.54282267792521, + "grad_norm": 0.5770965218544006, + "learning_rate": 1.5269247579836162e-06, + "loss": 0.07547505497932434, + "mean_token_accuracy": 0.9772329092025757, + "num_tokens": 11106761.0, + "step": 3960 + }, + { + "epoch": 9.54282267792521, + "eval_entropy": 0.24866497918461147, + "eval_loss": 0.8349169492721558, + "eval_mean_token_accuracy": 0.8455062398080075, + "eval_num_tokens": 11106761.0, + "eval_runtime": 55.1484, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3960 + }, + { + "entropy": 0.13437952492386102, + "epoch": 9.591073582629674, + "grad_norm": 0.5746839046478271, + "learning_rate": 1.2244148098023241e-06, + "loss": 0.0719214141368866, + "mean_token_accuracy": 0.9783813208341599, + "num_tokens": 11163241.0, + "step": 3980 + }, + { + "epoch": 9.591073582629674, + "eval_entropy": 0.2487325757909357, + "eval_loss": 0.8345232009887695, + "eval_mean_token_accuracy": 0.8452854909923639, + "eval_num_tokens": 11163241.0, + "eval_runtime": 55.2029, + "eval_samples_per_second": 25.723, + "eval_steps_per_second": 3.224, + "step": 3980 + }, + { + "entropy": 0.1449177075177431, + "epoch": 9.639324487334138, + "grad_norm": 0.7029407024383545, + "learning_rate": 9.551139612183896e-07, + "loss": 0.08021060228347779, + "mean_token_accuracy": 0.9748102590441704, + "num_tokens": 11215311.0, + "step": 4000 + }, + { + "epoch": 9.639324487334138, + "eval_entropy": 0.24849342898036655, + "eval_loss": 0.8350681066513062, + "eval_mean_token_accuracy": 0.8454755872822879, + "eval_num_tokens": 11215311.0, + "eval_runtime": 55.1818, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 4000 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.033403228962202e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4984a4eca1204d09b3588e203ae5513197325724 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/trainer_state.json @@ -0,0 +1,4255 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.687575392038601, + "eval_steps": 20, + "global_step": 4020, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + }, + { + "entropy": 0.15217966660857202, + "epoch": 8.772014475271412, + "grad_norm": 0.5612668395042419, + "learning_rate": 1.078526238998661e-05, + "loss": 0.09193292260169983, + "mean_token_accuracy": 0.9713213533163071, + "num_tokens": 10209639.0, + "step": 3640 + }, + { + "epoch": 8.772014475271412, + "eval_entropy": 0.26003232162989925, + "eval_loss": 0.7822859883308411, + "eval_mean_token_accuracy": 0.8468798703691932, + "eval_num_tokens": 10209639.0, + "eval_runtime": 55.1916, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3640 + }, + { + "entropy": 0.15681463126093148, + "epoch": 8.820265379975874, + "grad_norm": 0.5372915267944336, + "learning_rate": 9.969343163243224e-06, + "loss": 0.09630222916603089, + "mean_token_accuracy": 0.9702456504106521, + "num_tokens": 10263716.0, + "step": 3660 + }, + { + "epoch": 8.820265379975874, + "eval_entropy": 0.2598928563882796, + "eval_loss": 0.7829110026359558, + "eval_mean_token_accuracy": 0.8469086342983032, + "eval_num_tokens": 10263716.0, + "eval_runtime": 55.1354, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3660 + }, + { + "entropy": 0.1535819811746478, + "epoch": 8.868516284680338, + "grad_norm": 0.575184166431427, + "learning_rate": 9.184158324815683e-06, + "loss": 0.09521135687828064, + "mean_token_accuracy": 0.9696467980742455, + "num_tokens": 10320171.0, + "step": 3680 + }, + { + "epoch": 8.868516284680338, + "eval_entropy": 0.2629028752948461, + "eval_loss": 0.7783747911453247, + "eval_mean_token_accuracy": 0.8463931284593732, + "eval_num_tokens": 10320171.0, + "eval_runtime": 55.1499, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3680 + }, + { + "entropy": 0.15672272052615882, + "epoch": 8.916767189384801, + "grad_norm": 0.610146701335907, + "learning_rate": 8.429930072725457e-06, + "loss": 0.09335047006607056, + "mean_token_accuracy": 0.9709506019949913, + "num_tokens": 10377086.0, + "step": 3700 + }, + { + "epoch": 8.916767189384801, + "eval_entropy": 0.26106021611878044, + "eval_loss": 0.7824530005455017, + "eval_mean_token_accuracy": 0.8467274777005228, + "eval_num_tokens": 10377086.0, + "eval_runtime": 55.1677, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3700 + }, + { + "entropy": 0.15496804118156432, + "epoch": 8.965018094089265, + "grad_norm": 0.8847843408584595, + "learning_rate": 7.706871844646178e-06, + "loss": 0.09552072882652282, + "mean_token_accuracy": 0.9693930178880692, + "num_tokens": 10435272.0, + "step": 3720 + }, + { + "epoch": 8.965018094089265, + "eval_entropy": 0.26175538701622675, + "eval_loss": 0.7833470106124878, + "eval_mean_token_accuracy": 0.8464220507761065, + "eval_num_tokens": 10435272.0, + "eval_runtime": 55.1496, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3720 + }, + { + "entropy": 0.15739625711471605, + "epoch": 9.012062726176115, + "grad_norm": 0.35802221298217773, + "learning_rate": 7.0151882575034775e-06, + "loss": 0.09389110803604125, + "mean_token_accuracy": 0.9715636464265677, + "num_tokens": 10487076.0, + "step": 3740 + }, + { + "epoch": 9.012062726176115, + "eval_entropy": 0.26084648198291155, + "eval_loss": 0.7853291034698486, + "eval_mean_token_accuracy": 0.8463665585169632, + "eval_num_tokens": 10487076.0, + "eval_runtime": 55.1572, + "eval_samples_per_second": 25.745, + "eval_steps_per_second": 3.227, + "step": 3740 + }, + { + "entropy": 0.14215838070958853, + "epoch": 9.060313630880579, + "grad_norm": 0.4684004783630371, + "learning_rate": 6.35507504957069e-06, + "loss": 0.07583575248718262, + "mean_token_accuracy": 0.9782516390085221, + "num_tokens": 10543520.0, + "step": 3760 + }, + { + "epoch": 9.060313630880579, + "eval_entropy": 0.254280548333452, + "eval_loss": 0.807111382484436, + "eval_mean_token_accuracy": 0.8463215992022096, + "eval_num_tokens": 10543520.0, + "eval_runtime": 55.1805, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3760 + }, + { + "entropy": 0.1408092312514782, + "epoch": 9.108564535585042, + "grad_norm": 0.43550431728363037, + "learning_rate": 5.726719025077231e-06, + "loss": 0.07781847715377807, + "mean_token_accuracy": 0.9768255725502968, + "num_tokens": 10596528.0, + "step": 3780 + }, + { + "epoch": 9.108564535585042, + "eval_entropy": 0.2505798229340757, + "eval_loss": 0.8224650025367737, + "eval_mean_token_accuracy": 0.8463811020502884, + "eval_num_tokens": 10596528.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3780 + }, + { + "entropy": 0.13897503707557918, + "epoch": 9.156815440289506, + "grad_norm": 0.5255228281021118, + "learning_rate": 5.130298001345343e-06, + "loss": 0.07528382539749146, + "mean_token_accuracy": 0.9768329098820686, + "num_tokens": 10655544.0, + "step": 3800 + }, + { + "epoch": 9.156815440289506, + "eval_entropy": 0.2509520294960965, + "eval_loss": 0.8247353434562683, + "eval_mean_token_accuracy": 0.845786559112956, + "eval_num_tokens": 10655544.0, + "eval_runtime": 55.151, + "eval_samples_per_second": 25.747, + "eval_steps_per_second": 3.228, + "step": 3800 + }, + { + "entropy": 0.13998530581593513, + "epoch": 9.205066344993968, + "grad_norm": 0.4610442817211151, + "learning_rate": 4.565980758469731e-06, + "loss": 0.07813523411750793, + "mean_token_accuracy": 0.9769201070070267, + "num_tokens": 10710737.0, + "step": 3820 + }, + { + "epoch": 9.205066344993968, + "eval_entropy": 0.2503350620691696, + "eval_loss": 0.8271610736846924, + "eval_mean_token_accuracy": 0.8455204879969693, + "eval_num_tokens": 10710737.0, + "eval_runtime": 55.1706, + "eval_samples_per_second": 25.738, + "eval_steps_per_second": 3.226, + "step": 3820 + }, + { + "entropy": 0.13162653651088477, + "epoch": 9.253317249698432, + "grad_norm": 0.3627360463142395, + "learning_rate": 4.033926991554922e-06, + "loss": 0.07141604423522949, + "mean_token_accuracy": 0.9792275875806808, + "num_tokens": 10771727.0, + "step": 3840 + }, + { + "epoch": 9.253317249698432, + "eval_entropy": 0.24897396087311627, + "eval_loss": 0.8307036757469177, + "eval_mean_token_accuracy": 0.8453606835911783, + "eval_num_tokens": 10771727.0, + "eval_runtime": 55.168, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3840 + }, + { + "entropy": 0.14422765467315912, + "epoch": 9.301568154402895, + "grad_norm": 0.5706251263618469, + "learning_rate": 3.53428726552335e-06, + "loss": 0.07931464314460754, + "mean_token_accuracy": 0.9752222061157226, + "num_tokens": 10825849.0, + "step": 3860 + }, + { + "epoch": 9.301568154402895, + "eval_entropy": 0.24914598389622872, + "eval_loss": 0.8312752842903137, + "eval_mean_token_accuracy": 0.8455510604917333, + "eval_num_tokens": 10825849.0, + "eval_runtime": 55.1849, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3860 + }, + { + "entropy": 0.13540438804775476, + "epoch": 9.349819059107359, + "grad_norm": 0.6334630250930786, + "learning_rate": 3.0672029725073196e-06, + "loss": 0.07548695802688599, + "mean_token_accuracy": 0.9778543844819069, + "num_tokens": 10884611.0, + "step": 3880 + }, + { + "epoch": 9.349819059107359, + "eval_entropy": 0.24910795722114906, + "eval_loss": 0.8316059112548828, + "eval_mean_token_accuracy": 0.8456973557391864, + "eval_num_tokens": 10884611.0, + "eval_runtime": 55.1728, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3880 + }, + { + "entropy": 0.1410717975348234, + "epoch": 9.398069963811821, + "grad_norm": 0.5009909868240356, + "learning_rate": 2.632806291836666e-06, + "loss": 0.07720760703086853, + "mean_token_accuracy": 0.9768648758530617, + "num_tokens": 10938300.0, + "step": 3900 + }, + { + "epoch": 9.398069963811821, + "eval_entropy": 0.24874219742048992, + "eval_loss": 0.8328408598899841, + "eval_mean_token_accuracy": 0.8455627888775943, + "eval_num_tokens": 10938300.0, + "eval_runtime": 55.1736, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3900 + }, + { + "entropy": 0.13674762714654207, + "epoch": 9.446320868516285, + "grad_norm": 0.46003177762031555, + "learning_rate": 2.231220152633621e-06, + "loss": 0.07583877444267273, + "mean_token_accuracy": 0.9772356480360032, + "num_tokens": 10994442.0, + "step": 3920 + }, + { + "epoch": 9.446320868516285, + "eval_entropy": 0.24896439030933915, + "eval_loss": 0.8331801891326904, + "eval_mean_token_accuracy": 0.8453999262177543, + "eval_num_tokens": 10994442.0, + "eval_runtime": 55.1815, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3920 + }, + { + "entropy": 0.1378044320270419, + "epoch": 9.494571773220748, + "grad_norm": 0.5056300759315491, + "learning_rate": 1.862558199025263e-06, + "loss": 0.07420622110366822, + "mean_token_accuracy": 0.9771117404103279, + "num_tokens": 11052708.0, + "step": 3940 + }, + { + "epoch": 9.494571773220748, + "eval_entropy": 0.2490867761413703, + "eval_loss": 0.8338391780853271, + "eval_mean_token_accuracy": 0.8453461571355884, + "eval_num_tokens": 11052708.0, + "eval_runtime": 59.1613, + "eval_samples_per_second": 24.002, + "eval_steps_per_second": 3.009, + "step": 3940 + }, + { + "entropy": 0.13731490727514029, + "epoch": 9.54282267792521, + "grad_norm": 0.5770965218544006, + "learning_rate": 1.5269247579836162e-06, + "loss": 0.07547505497932434, + "mean_token_accuracy": 0.9772329092025757, + "num_tokens": 11106761.0, + "step": 3960 + }, + { + "epoch": 9.54282267792521, + "eval_entropy": 0.24866497918461147, + "eval_loss": 0.8349169492721558, + "eval_mean_token_accuracy": 0.8455062398080075, + "eval_num_tokens": 11106761.0, + "eval_runtime": 55.1484, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3960 + }, + { + "entropy": 0.13437952492386102, + "epoch": 9.591073582629674, + "grad_norm": 0.5746839046478271, + "learning_rate": 1.2244148098023241e-06, + "loss": 0.0719214141368866, + "mean_token_accuracy": 0.9783813208341599, + "num_tokens": 11163241.0, + "step": 3980 + }, + { + "epoch": 9.591073582629674, + "eval_entropy": 0.2487325757909357, + "eval_loss": 0.8345232009887695, + "eval_mean_token_accuracy": 0.8452854909923639, + "eval_num_tokens": 11163241.0, + "eval_runtime": 55.2029, + "eval_samples_per_second": 25.723, + "eval_steps_per_second": 3.224, + "step": 3980 + }, + { + "entropy": 0.1449177075177431, + "epoch": 9.639324487334138, + "grad_norm": 0.7029407024383545, + "learning_rate": 9.551139612183896e-07, + "loss": 0.08021060228347779, + "mean_token_accuracy": 0.9748102590441704, + "num_tokens": 11215311.0, + "step": 4000 + }, + { + "epoch": 9.639324487334138, + "eval_entropy": 0.24849342898036655, + "eval_loss": 0.8350681066513062, + "eval_mean_token_accuracy": 0.8454755872822879, + "eval_num_tokens": 11215311.0, + "eval_runtime": 55.1818, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 4000 + }, + { + "entropy": 0.13718088436871767, + "epoch": 9.687575392038601, + "grad_norm": 0.4247730076313019, + "learning_rate": 7.190984211864178e-07, + "loss": 0.0763831913471222, + "mean_token_accuracy": 0.9777001023292542, + "num_tokens": 11272587.0, + "step": 4020 + }, + { + "epoch": 9.687575392038601, + "eval_entropy": 0.2485166782241189, + "eval_loss": 0.8347740769386292, + "eval_mean_token_accuracy": 0.8455832382936156, + "eval_num_tokens": 11272587.0, + "eval_runtime": 55.1819, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 4020 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.064133544766464e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..96ffb71ad5524373a3bddde46cfec956684cad63 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/trainer_state.json @@ -0,0 +1,4276 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.735826296743063, + "eval_steps": 20, + "global_step": 4040, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + }, + { + "entropy": 0.15217966660857202, + "epoch": 8.772014475271412, + "grad_norm": 0.5612668395042419, + "learning_rate": 1.078526238998661e-05, + "loss": 0.09193292260169983, + "mean_token_accuracy": 0.9713213533163071, + "num_tokens": 10209639.0, + "step": 3640 + }, + { + "epoch": 8.772014475271412, + "eval_entropy": 0.26003232162989925, + "eval_loss": 0.7822859883308411, + "eval_mean_token_accuracy": 0.8468798703691932, + "eval_num_tokens": 10209639.0, + "eval_runtime": 55.1916, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3640 + }, + { + "entropy": 0.15681463126093148, + "epoch": 8.820265379975874, + "grad_norm": 0.5372915267944336, + "learning_rate": 9.969343163243224e-06, + "loss": 0.09630222916603089, + "mean_token_accuracy": 0.9702456504106521, + "num_tokens": 10263716.0, + "step": 3660 + }, + { + "epoch": 8.820265379975874, + "eval_entropy": 0.2598928563882796, + "eval_loss": 0.7829110026359558, + "eval_mean_token_accuracy": 0.8469086342983032, + "eval_num_tokens": 10263716.0, + "eval_runtime": 55.1354, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3660 + }, + { + "entropy": 0.1535819811746478, + "epoch": 8.868516284680338, + "grad_norm": 0.575184166431427, + "learning_rate": 9.184158324815683e-06, + "loss": 0.09521135687828064, + "mean_token_accuracy": 0.9696467980742455, + "num_tokens": 10320171.0, + "step": 3680 + }, + { + "epoch": 8.868516284680338, + "eval_entropy": 0.2629028752948461, + "eval_loss": 0.7783747911453247, + "eval_mean_token_accuracy": 0.8463931284593732, + "eval_num_tokens": 10320171.0, + "eval_runtime": 55.1499, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3680 + }, + { + "entropy": 0.15672272052615882, + "epoch": 8.916767189384801, + "grad_norm": 0.610146701335907, + "learning_rate": 8.429930072725457e-06, + "loss": 0.09335047006607056, + "mean_token_accuracy": 0.9709506019949913, + "num_tokens": 10377086.0, + "step": 3700 + }, + { + "epoch": 8.916767189384801, + "eval_entropy": 0.26106021611878044, + "eval_loss": 0.7824530005455017, + "eval_mean_token_accuracy": 0.8467274777005228, + "eval_num_tokens": 10377086.0, + "eval_runtime": 55.1677, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3700 + }, + { + "entropy": 0.15496804118156432, + "epoch": 8.965018094089265, + "grad_norm": 0.8847843408584595, + "learning_rate": 7.706871844646178e-06, + "loss": 0.09552072882652282, + "mean_token_accuracy": 0.9693930178880692, + "num_tokens": 10435272.0, + "step": 3720 + }, + { + "epoch": 8.965018094089265, + "eval_entropy": 0.26175538701622675, + "eval_loss": 0.7833470106124878, + "eval_mean_token_accuracy": 0.8464220507761065, + "eval_num_tokens": 10435272.0, + "eval_runtime": 55.1496, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3720 + }, + { + "entropy": 0.15739625711471605, + "epoch": 9.012062726176115, + "grad_norm": 0.35802221298217773, + "learning_rate": 7.0151882575034775e-06, + "loss": 0.09389110803604125, + "mean_token_accuracy": 0.9715636464265677, + "num_tokens": 10487076.0, + "step": 3740 + }, + { + "epoch": 9.012062726176115, + "eval_entropy": 0.26084648198291155, + "eval_loss": 0.7853291034698486, + "eval_mean_token_accuracy": 0.8463665585169632, + "eval_num_tokens": 10487076.0, + "eval_runtime": 55.1572, + "eval_samples_per_second": 25.745, + "eval_steps_per_second": 3.227, + "step": 3740 + }, + { + "entropy": 0.14215838070958853, + "epoch": 9.060313630880579, + "grad_norm": 0.4684004783630371, + "learning_rate": 6.35507504957069e-06, + "loss": 0.07583575248718262, + "mean_token_accuracy": 0.9782516390085221, + "num_tokens": 10543520.0, + "step": 3760 + }, + { + "epoch": 9.060313630880579, + "eval_entropy": 0.254280548333452, + "eval_loss": 0.807111382484436, + "eval_mean_token_accuracy": 0.8463215992022096, + "eval_num_tokens": 10543520.0, + "eval_runtime": 55.1805, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3760 + }, + { + "entropy": 0.1408092312514782, + "epoch": 9.108564535585042, + "grad_norm": 0.43550431728363037, + "learning_rate": 5.726719025077231e-06, + "loss": 0.07781847715377807, + "mean_token_accuracy": 0.9768255725502968, + "num_tokens": 10596528.0, + "step": 3780 + }, + { + "epoch": 9.108564535585042, + "eval_entropy": 0.2505798229340757, + "eval_loss": 0.8224650025367737, + "eval_mean_token_accuracy": 0.8463811020502884, + "eval_num_tokens": 10596528.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3780 + }, + { + "entropy": 0.13897503707557918, + "epoch": 9.156815440289506, + "grad_norm": 0.5255228281021118, + "learning_rate": 5.130298001345343e-06, + "loss": 0.07528382539749146, + "mean_token_accuracy": 0.9768329098820686, + "num_tokens": 10655544.0, + "step": 3800 + }, + { + "epoch": 9.156815440289506, + "eval_entropy": 0.2509520294960965, + "eval_loss": 0.8247353434562683, + "eval_mean_token_accuracy": 0.845786559112956, + "eval_num_tokens": 10655544.0, + "eval_runtime": 55.151, + "eval_samples_per_second": 25.747, + "eval_steps_per_second": 3.228, + "step": 3800 + }, + { + "entropy": 0.13998530581593513, + "epoch": 9.205066344993968, + "grad_norm": 0.4610442817211151, + "learning_rate": 4.565980758469731e-06, + "loss": 0.07813523411750793, + "mean_token_accuracy": 0.9769201070070267, + "num_tokens": 10710737.0, + "step": 3820 + }, + { + "epoch": 9.205066344993968, + "eval_entropy": 0.2503350620691696, + "eval_loss": 0.8271610736846924, + "eval_mean_token_accuracy": 0.8455204879969693, + "eval_num_tokens": 10710737.0, + "eval_runtime": 55.1706, + "eval_samples_per_second": 25.738, + "eval_steps_per_second": 3.226, + "step": 3820 + }, + { + "entropy": 0.13162653651088477, + "epoch": 9.253317249698432, + "grad_norm": 0.3627360463142395, + "learning_rate": 4.033926991554922e-06, + "loss": 0.07141604423522949, + "mean_token_accuracy": 0.9792275875806808, + "num_tokens": 10771727.0, + "step": 3840 + }, + { + "epoch": 9.253317249698432, + "eval_entropy": 0.24897396087311627, + "eval_loss": 0.8307036757469177, + "eval_mean_token_accuracy": 0.8453606835911783, + "eval_num_tokens": 10771727.0, + "eval_runtime": 55.168, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3840 + }, + { + "entropy": 0.14422765467315912, + "epoch": 9.301568154402895, + "grad_norm": 0.5706251263618469, + "learning_rate": 3.53428726552335e-06, + "loss": 0.07931464314460754, + "mean_token_accuracy": 0.9752222061157226, + "num_tokens": 10825849.0, + "step": 3860 + }, + { + "epoch": 9.301568154402895, + "eval_entropy": 0.24914598389622872, + "eval_loss": 0.8312752842903137, + "eval_mean_token_accuracy": 0.8455510604917333, + "eval_num_tokens": 10825849.0, + "eval_runtime": 55.1849, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3860 + }, + { + "entropy": 0.13540438804775476, + "epoch": 9.349819059107359, + "grad_norm": 0.6334630250930786, + "learning_rate": 3.0672029725073196e-06, + "loss": 0.07548695802688599, + "mean_token_accuracy": 0.9778543844819069, + "num_tokens": 10884611.0, + "step": 3880 + }, + { + "epoch": 9.349819059107359, + "eval_entropy": 0.24910795722114906, + "eval_loss": 0.8316059112548828, + "eval_mean_token_accuracy": 0.8456973557391864, + "eval_num_tokens": 10884611.0, + "eval_runtime": 55.1728, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3880 + }, + { + "entropy": 0.1410717975348234, + "epoch": 9.398069963811821, + "grad_norm": 0.5009909868240356, + "learning_rate": 2.632806291836666e-06, + "loss": 0.07720760703086853, + "mean_token_accuracy": 0.9768648758530617, + "num_tokens": 10938300.0, + "step": 3900 + }, + { + "epoch": 9.398069963811821, + "eval_entropy": 0.24874219742048992, + "eval_loss": 0.8328408598899841, + "eval_mean_token_accuracy": 0.8455627888775943, + "eval_num_tokens": 10938300.0, + "eval_runtime": 55.1736, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3900 + }, + { + "entropy": 0.13674762714654207, + "epoch": 9.446320868516285, + "grad_norm": 0.46003177762031555, + "learning_rate": 2.231220152633621e-06, + "loss": 0.07583877444267273, + "mean_token_accuracy": 0.9772356480360032, + "num_tokens": 10994442.0, + "step": 3920 + }, + { + "epoch": 9.446320868516285, + "eval_entropy": 0.24896439030933915, + "eval_loss": 0.8331801891326904, + "eval_mean_token_accuracy": 0.8453999262177543, + "eval_num_tokens": 10994442.0, + "eval_runtime": 55.1815, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3920 + }, + { + "entropy": 0.1378044320270419, + "epoch": 9.494571773220748, + "grad_norm": 0.5056300759315491, + "learning_rate": 1.862558199025263e-06, + "loss": 0.07420622110366822, + "mean_token_accuracy": 0.9771117404103279, + "num_tokens": 11052708.0, + "step": 3940 + }, + { + "epoch": 9.494571773220748, + "eval_entropy": 0.2490867761413703, + "eval_loss": 0.8338391780853271, + "eval_mean_token_accuracy": 0.8453461571355884, + "eval_num_tokens": 11052708.0, + "eval_runtime": 59.1613, + "eval_samples_per_second": 24.002, + "eval_steps_per_second": 3.009, + "step": 3940 + }, + { + "entropy": 0.13731490727514029, + "epoch": 9.54282267792521, + "grad_norm": 0.5770965218544006, + "learning_rate": 1.5269247579836162e-06, + "loss": 0.07547505497932434, + "mean_token_accuracy": 0.9772329092025757, + "num_tokens": 11106761.0, + "step": 3960 + }, + { + "epoch": 9.54282267792521, + "eval_entropy": 0.24866497918461147, + "eval_loss": 0.8349169492721558, + "eval_mean_token_accuracy": 0.8455062398080075, + "eval_num_tokens": 11106761.0, + "eval_runtime": 55.1484, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3960 + }, + { + "entropy": 0.13437952492386102, + "epoch": 9.591073582629674, + "grad_norm": 0.5746839046478271, + "learning_rate": 1.2244148098023241e-06, + "loss": 0.0719214141368866, + "mean_token_accuracy": 0.9783813208341599, + "num_tokens": 11163241.0, + "step": 3980 + }, + { + "epoch": 9.591073582629674, + "eval_entropy": 0.2487325757909357, + "eval_loss": 0.8345232009887695, + "eval_mean_token_accuracy": 0.8452854909923639, + "eval_num_tokens": 11163241.0, + "eval_runtime": 55.2029, + "eval_samples_per_second": 25.723, + "eval_steps_per_second": 3.224, + "step": 3980 + }, + { + "entropy": 0.1449177075177431, + "epoch": 9.639324487334138, + "grad_norm": 0.7029407024383545, + "learning_rate": 9.551139612183896e-07, + "loss": 0.08021060228347779, + "mean_token_accuracy": 0.9748102590441704, + "num_tokens": 11215311.0, + "step": 4000 + }, + { + "epoch": 9.639324487334138, + "eval_entropy": 0.24849342898036655, + "eval_loss": 0.8350681066513062, + "eval_mean_token_accuracy": 0.8454755872822879, + "eval_num_tokens": 11215311.0, + "eval_runtime": 55.1818, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 4000 + }, + { + "entropy": 0.13718088436871767, + "epoch": 9.687575392038601, + "grad_norm": 0.4247730076313019, + "learning_rate": 7.190984211864178e-07, + "loss": 0.0763831913471222, + "mean_token_accuracy": 0.9777001023292542, + "num_tokens": 11272587.0, + "step": 4020 + }, + { + "epoch": 9.687575392038601, + "eval_entropy": 0.2485166782241189, + "eval_loss": 0.8347740769386292, + "eval_mean_token_accuracy": 0.8455832382936156, + "eval_num_tokens": 11272587.0, + "eval_runtime": 55.1819, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 4020 + }, + { + "entropy": 0.1414831655099988, + "epoch": 9.735826296743063, + "grad_norm": 0.4344032108783722, + "learning_rate": 5.164349793124746e-07, + "loss": 0.0786937952041626, + "mean_token_accuracy": 0.9762448608875275, + "num_tokens": 11326845.0, + "step": 4040 + }, + { + "epoch": 9.735826296743063, + "eval_entropy": 0.2485147834326444, + "eval_loss": 0.8348782658576965, + "eval_mean_token_accuracy": 0.8454727480250798, + "eval_num_tokens": 11326845.0, + "eval_runtime": 55.1896, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 4040 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.094393956537754e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f4915d4c9422633732b78d88ce7ed1a10fdc0383 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/trainer_state.json @@ -0,0 +1,4297 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.784077201447527, + "eval_steps": 20, + "global_step": 4060, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + }, + { + "entropy": 0.15217966660857202, + "epoch": 8.772014475271412, + "grad_norm": 0.5612668395042419, + "learning_rate": 1.078526238998661e-05, + "loss": 0.09193292260169983, + "mean_token_accuracy": 0.9713213533163071, + "num_tokens": 10209639.0, + "step": 3640 + }, + { + "epoch": 8.772014475271412, + "eval_entropy": 0.26003232162989925, + "eval_loss": 0.7822859883308411, + "eval_mean_token_accuracy": 0.8468798703691932, + "eval_num_tokens": 10209639.0, + "eval_runtime": 55.1916, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3640 + }, + { + "entropy": 0.15681463126093148, + "epoch": 8.820265379975874, + "grad_norm": 0.5372915267944336, + "learning_rate": 9.969343163243224e-06, + "loss": 0.09630222916603089, + "mean_token_accuracy": 0.9702456504106521, + "num_tokens": 10263716.0, + "step": 3660 + }, + { + "epoch": 8.820265379975874, + "eval_entropy": 0.2598928563882796, + "eval_loss": 0.7829110026359558, + "eval_mean_token_accuracy": 0.8469086342983032, + "eval_num_tokens": 10263716.0, + "eval_runtime": 55.1354, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3660 + }, + { + "entropy": 0.1535819811746478, + "epoch": 8.868516284680338, + "grad_norm": 0.575184166431427, + "learning_rate": 9.184158324815683e-06, + "loss": 0.09521135687828064, + "mean_token_accuracy": 0.9696467980742455, + "num_tokens": 10320171.0, + "step": 3680 + }, + { + "epoch": 8.868516284680338, + "eval_entropy": 0.2629028752948461, + "eval_loss": 0.7783747911453247, + "eval_mean_token_accuracy": 0.8463931284593732, + "eval_num_tokens": 10320171.0, + "eval_runtime": 55.1499, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3680 + }, + { + "entropy": 0.15672272052615882, + "epoch": 8.916767189384801, + "grad_norm": 0.610146701335907, + "learning_rate": 8.429930072725457e-06, + "loss": 0.09335047006607056, + "mean_token_accuracy": 0.9709506019949913, + "num_tokens": 10377086.0, + "step": 3700 + }, + { + "epoch": 8.916767189384801, + "eval_entropy": 0.26106021611878044, + "eval_loss": 0.7824530005455017, + "eval_mean_token_accuracy": 0.8467274777005228, + "eval_num_tokens": 10377086.0, + "eval_runtime": 55.1677, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3700 + }, + { + "entropy": 0.15496804118156432, + "epoch": 8.965018094089265, + "grad_norm": 0.8847843408584595, + "learning_rate": 7.706871844646178e-06, + "loss": 0.09552072882652282, + "mean_token_accuracy": 0.9693930178880692, + "num_tokens": 10435272.0, + "step": 3720 + }, + { + "epoch": 8.965018094089265, + "eval_entropy": 0.26175538701622675, + "eval_loss": 0.7833470106124878, + "eval_mean_token_accuracy": 0.8464220507761065, + "eval_num_tokens": 10435272.0, + "eval_runtime": 55.1496, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3720 + }, + { + "entropy": 0.15739625711471605, + "epoch": 9.012062726176115, + "grad_norm": 0.35802221298217773, + "learning_rate": 7.0151882575034775e-06, + "loss": 0.09389110803604125, + "mean_token_accuracy": 0.9715636464265677, + "num_tokens": 10487076.0, + "step": 3740 + }, + { + "epoch": 9.012062726176115, + "eval_entropy": 0.26084648198291155, + "eval_loss": 0.7853291034698486, + "eval_mean_token_accuracy": 0.8463665585169632, + "eval_num_tokens": 10487076.0, + "eval_runtime": 55.1572, + "eval_samples_per_second": 25.745, + "eval_steps_per_second": 3.227, + "step": 3740 + }, + { + "entropy": 0.14215838070958853, + "epoch": 9.060313630880579, + "grad_norm": 0.4684004783630371, + "learning_rate": 6.35507504957069e-06, + "loss": 0.07583575248718262, + "mean_token_accuracy": 0.9782516390085221, + "num_tokens": 10543520.0, + "step": 3760 + }, + { + "epoch": 9.060313630880579, + "eval_entropy": 0.254280548333452, + "eval_loss": 0.807111382484436, + "eval_mean_token_accuracy": 0.8463215992022096, + "eval_num_tokens": 10543520.0, + "eval_runtime": 55.1805, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3760 + }, + { + "entropy": 0.1408092312514782, + "epoch": 9.108564535585042, + "grad_norm": 0.43550431728363037, + "learning_rate": 5.726719025077231e-06, + "loss": 0.07781847715377807, + "mean_token_accuracy": 0.9768255725502968, + "num_tokens": 10596528.0, + "step": 3780 + }, + { + "epoch": 9.108564535585042, + "eval_entropy": 0.2505798229340757, + "eval_loss": 0.8224650025367737, + "eval_mean_token_accuracy": 0.8463811020502884, + "eval_num_tokens": 10596528.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3780 + }, + { + "entropy": 0.13897503707557918, + "epoch": 9.156815440289506, + "grad_norm": 0.5255228281021118, + "learning_rate": 5.130298001345343e-06, + "loss": 0.07528382539749146, + "mean_token_accuracy": 0.9768329098820686, + "num_tokens": 10655544.0, + "step": 3800 + }, + { + "epoch": 9.156815440289506, + "eval_entropy": 0.2509520294960965, + "eval_loss": 0.8247353434562683, + "eval_mean_token_accuracy": 0.845786559112956, + "eval_num_tokens": 10655544.0, + "eval_runtime": 55.151, + "eval_samples_per_second": 25.747, + "eval_steps_per_second": 3.228, + "step": 3800 + }, + { + "entropy": 0.13998530581593513, + "epoch": 9.205066344993968, + "grad_norm": 0.4610442817211151, + "learning_rate": 4.565980758469731e-06, + "loss": 0.07813523411750793, + "mean_token_accuracy": 0.9769201070070267, + "num_tokens": 10710737.0, + "step": 3820 + }, + { + "epoch": 9.205066344993968, + "eval_entropy": 0.2503350620691696, + "eval_loss": 0.8271610736846924, + "eval_mean_token_accuracy": 0.8455204879969693, + "eval_num_tokens": 10710737.0, + "eval_runtime": 55.1706, + "eval_samples_per_second": 25.738, + "eval_steps_per_second": 3.226, + "step": 3820 + }, + { + "entropy": 0.13162653651088477, + "epoch": 9.253317249698432, + "grad_norm": 0.3627360463142395, + "learning_rate": 4.033926991554922e-06, + "loss": 0.07141604423522949, + "mean_token_accuracy": 0.9792275875806808, + "num_tokens": 10771727.0, + "step": 3840 + }, + { + "epoch": 9.253317249698432, + "eval_entropy": 0.24897396087311627, + "eval_loss": 0.8307036757469177, + "eval_mean_token_accuracy": 0.8453606835911783, + "eval_num_tokens": 10771727.0, + "eval_runtime": 55.168, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3840 + }, + { + "entropy": 0.14422765467315912, + "epoch": 9.301568154402895, + "grad_norm": 0.5706251263618469, + "learning_rate": 3.53428726552335e-06, + "loss": 0.07931464314460754, + "mean_token_accuracy": 0.9752222061157226, + "num_tokens": 10825849.0, + "step": 3860 + }, + { + "epoch": 9.301568154402895, + "eval_entropy": 0.24914598389622872, + "eval_loss": 0.8312752842903137, + "eval_mean_token_accuracy": 0.8455510604917333, + "eval_num_tokens": 10825849.0, + "eval_runtime": 55.1849, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3860 + }, + { + "entropy": 0.13540438804775476, + "epoch": 9.349819059107359, + "grad_norm": 0.6334630250930786, + "learning_rate": 3.0672029725073196e-06, + "loss": 0.07548695802688599, + "mean_token_accuracy": 0.9778543844819069, + "num_tokens": 10884611.0, + "step": 3880 + }, + { + "epoch": 9.349819059107359, + "eval_entropy": 0.24910795722114906, + "eval_loss": 0.8316059112548828, + "eval_mean_token_accuracy": 0.8456973557391864, + "eval_num_tokens": 10884611.0, + "eval_runtime": 55.1728, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3880 + }, + { + "entropy": 0.1410717975348234, + "epoch": 9.398069963811821, + "grad_norm": 0.5009909868240356, + "learning_rate": 2.632806291836666e-06, + "loss": 0.07720760703086853, + "mean_token_accuracy": 0.9768648758530617, + "num_tokens": 10938300.0, + "step": 3900 + }, + { + "epoch": 9.398069963811821, + "eval_entropy": 0.24874219742048992, + "eval_loss": 0.8328408598899841, + "eval_mean_token_accuracy": 0.8455627888775943, + "eval_num_tokens": 10938300.0, + "eval_runtime": 55.1736, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3900 + }, + { + "entropy": 0.13674762714654207, + "epoch": 9.446320868516285, + "grad_norm": 0.46003177762031555, + "learning_rate": 2.231220152633621e-06, + "loss": 0.07583877444267273, + "mean_token_accuracy": 0.9772356480360032, + "num_tokens": 10994442.0, + "step": 3920 + }, + { + "epoch": 9.446320868516285, + "eval_entropy": 0.24896439030933915, + "eval_loss": 0.8331801891326904, + "eval_mean_token_accuracy": 0.8453999262177543, + "eval_num_tokens": 10994442.0, + "eval_runtime": 55.1815, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3920 + }, + { + "entropy": 0.1378044320270419, + "epoch": 9.494571773220748, + "grad_norm": 0.5056300759315491, + "learning_rate": 1.862558199025263e-06, + "loss": 0.07420622110366822, + "mean_token_accuracy": 0.9771117404103279, + "num_tokens": 11052708.0, + "step": 3940 + }, + { + "epoch": 9.494571773220748, + "eval_entropy": 0.2490867761413703, + "eval_loss": 0.8338391780853271, + "eval_mean_token_accuracy": 0.8453461571355884, + "eval_num_tokens": 11052708.0, + "eval_runtime": 59.1613, + "eval_samples_per_second": 24.002, + "eval_steps_per_second": 3.009, + "step": 3940 + }, + { + "entropy": 0.13731490727514029, + "epoch": 9.54282267792521, + "grad_norm": 0.5770965218544006, + "learning_rate": 1.5269247579836162e-06, + "loss": 0.07547505497932434, + "mean_token_accuracy": 0.9772329092025757, + "num_tokens": 11106761.0, + "step": 3960 + }, + { + "epoch": 9.54282267792521, + "eval_entropy": 0.24866497918461147, + "eval_loss": 0.8349169492721558, + "eval_mean_token_accuracy": 0.8455062398080075, + "eval_num_tokens": 11106761.0, + "eval_runtime": 55.1484, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3960 + }, + { + "entropy": 0.13437952492386102, + "epoch": 9.591073582629674, + "grad_norm": 0.5746839046478271, + "learning_rate": 1.2244148098023241e-06, + "loss": 0.0719214141368866, + "mean_token_accuracy": 0.9783813208341599, + "num_tokens": 11163241.0, + "step": 3980 + }, + { + "epoch": 9.591073582629674, + "eval_entropy": 0.2487325757909357, + "eval_loss": 0.8345232009887695, + "eval_mean_token_accuracy": 0.8452854909923639, + "eval_num_tokens": 11163241.0, + "eval_runtime": 55.2029, + "eval_samples_per_second": 25.723, + "eval_steps_per_second": 3.224, + "step": 3980 + }, + { + "entropy": 0.1449177075177431, + "epoch": 9.639324487334138, + "grad_norm": 0.7029407024383545, + "learning_rate": 9.551139612183896e-07, + "loss": 0.08021060228347779, + "mean_token_accuracy": 0.9748102590441704, + "num_tokens": 11215311.0, + "step": 4000 + }, + { + "epoch": 9.639324487334138, + "eval_entropy": 0.24849342898036655, + "eval_loss": 0.8350681066513062, + "eval_mean_token_accuracy": 0.8454755872822879, + "eval_num_tokens": 11215311.0, + "eval_runtime": 55.1818, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 4000 + }, + { + "entropy": 0.13718088436871767, + "epoch": 9.687575392038601, + "grad_norm": 0.4247730076313019, + "learning_rate": 7.190984211864178e-07, + "loss": 0.0763831913471222, + "mean_token_accuracy": 0.9777001023292542, + "num_tokens": 11272587.0, + "step": 4020 + }, + { + "epoch": 9.687575392038601, + "eval_entropy": 0.2485166782241189, + "eval_loss": 0.8347740769386292, + "eval_mean_token_accuracy": 0.8455832382936156, + "eval_num_tokens": 11272587.0, + "eval_runtime": 55.1819, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 4020 + }, + { + "entropy": 0.1414831655099988, + "epoch": 9.735826296743063, + "grad_norm": 0.4344032108783722, + "learning_rate": 5.164349793124746e-07, + "loss": 0.0786937952041626, + "mean_token_accuracy": 0.9762448608875275, + "num_tokens": 11326845.0, + "step": 4040 + }, + { + "epoch": 9.735826296743063, + "eval_entropy": 0.2485147834326444, + "eval_loss": 0.8348782658576965, + "eval_mean_token_accuracy": 0.8454727480250798, + "eval_num_tokens": 11326845.0, + "eval_runtime": 55.1896, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 4040 + }, + { + "entropy": 0.1397345969453454, + "epoch": 9.784077201447527, + "grad_norm": 0.5865362882614136, + "learning_rate": 3.4718098695330847e-07, + "loss": 0.07839923501014709, + "mean_token_accuracy": 0.9766460061073303, + "num_tokens": 11381321.0, + "step": 4060 + }, + { + "epoch": 9.784077201447527, + "eval_entropy": 0.24835219778371662, + "eval_loss": 0.8349990248680115, + "eval_mean_token_accuracy": 0.8452944608216875, + "eval_num_tokens": 11381321.0, + "eval_runtime": 55.1862, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 4060 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.124510053212774e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2f73da204a790970671f1252563210ae5cdf9f67 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/trainer_state.json @@ -0,0 +1,4318 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.83232810615199, + "eval_steps": 20, + "global_step": 4080, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + }, + { + "entropy": 0.15217966660857202, + "epoch": 8.772014475271412, + "grad_norm": 0.5612668395042419, + "learning_rate": 1.078526238998661e-05, + "loss": 0.09193292260169983, + "mean_token_accuracy": 0.9713213533163071, + "num_tokens": 10209639.0, + "step": 3640 + }, + { + "epoch": 8.772014475271412, + "eval_entropy": 0.26003232162989925, + "eval_loss": 0.7822859883308411, + "eval_mean_token_accuracy": 0.8468798703691932, + "eval_num_tokens": 10209639.0, + "eval_runtime": 55.1916, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3640 + }, + { + "entropy": 0.15681463126093148, + "epoch": 8.820265379975874, + "grad_norm": 0.5372915267944336, + "learning_rate": 9.969343163243224e-06, + "loss": 0.09630222916603089, + "mean_token_accuracy": 0.9702456504106521, + "num_tokens": 10263716.0, + "step": 3660 + }, + { + "epoch": 8.820265379975874, + "eval_entropy": 0.2598928563882796, + "eval_loss": 0.7829110026359558, + "eval_mean_token_accuracy": 0.8469086342983032, + "eval_num_tokens": 10263716.0, + "eval_runtime": 55.1354, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3660 + }, + { + "entropy": 0.1535819811746478, + "epoch": 8.868516284680338, + "grad_norm": 0.575184166431427, + "learning_rate": 9.184158324815683e-06, + "loss": 0.09521135687828064, + "mean_token_accuracy": 0.9696467980742455, + "num_tokens": 10320171.0, + "step": 3680 + }, + { + "epoch": 8.868516284680338, + "eval_entropy": 0.2629028752948461, + "eval_loss": 0.7783747911453247, + "eval_mean_token_accuracy": 0.8463931284593732, + "eval_num_tokens": 10320171.0, + "eval_runtime": 55.1499, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3680 + }, + { + "entropy": 0.15672272052615882, + "epoch": 8.916767189384801, + "grad_norm": 0.610146701335907, + "learning_rate": 8.429930072725457e-06, + "loss": 0.09335047006607056, + "mean_token_accuracy": 0.9709506019949913, + "num_tokens": 10377086.0, + "step": 3700 + }, + { + "epoch": 8.916767189384801, + "eval_entropy": 0.26106021611878044, + "eval_loss": 0.7824530005455017, + "eval_mean_token_accuracy": 0.8467274777005228, + "eval_num_tokens": 10377086.0, + "eval_runtime": 55.1677, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3700 + }, + { + "entropy": 0.15496804118156432, + "epoch": 8.965018094089265, + "grad_norm": 0.8847843408584595, + "learning_rate": 7.706871844646178e-06, + "loss": 0.09552072882652282, + "mean_token_accuracy": 0.9693930178880692, + "num_tokens": 10435272.0, + "step": 3720 + }, + { + "epoch": 8.965018094089265, + "eval_entropy": 0.26175538701622675, + "eval_loss": 0.7833470106124878, + "eval_mean_token_accuracy": 0.8464220507761065, + "eval_num_tokens": 10435272.0, + "eval_runtime": 55.1496, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3720 + }, + { + "entropy": 0.15739625711471605, + "epoch": 9.012062726176115, + "grad_norm": 0.35802221298217773, + "learning_rate": 7.0151882575034775e-06, + "loss": 0.09389110803604125, + "mean_token_accuracy": 0.9715636464265677, + "num_tokens": 10487076.0, + "step": 3740 + }, + { + "epoch": 9.012062726176115, + "eval_entropy": 0.26084648198291155, + "eval_loss": 0.7853291034698486, + "eval_mean_token_accuracy": 0.8463665585169632, + "eval_num_tokens": 10487076.0, + "eval_runtime": 55.1572, + "eval_samples_per_second": 25.745, + "eval_steps_per_second": 3.227, + "step": 3740 + }, + { + "entropy": 0.14215838070958853, + "epoch": 9.060313630880579, + "grad_norm": 0.4684004783630371, + "learning_rate": 6.35507504957069e-06, + "loss": 0.07583575248718262, + "mean_token_accuracy": 0.9782516390085221, + "num_tokens": 10543520.0, + "step": 3760 + }, + { + "epoch": 9.060313630880579, + "eval_entropy": 0.254280548333452, + "eval_loss": 0.807111382484436, + "eval_mean_token_accuracy": 0.8463215992022096, + "eval_num_tokens": 10543520.0, + "eval_runtime": 55.1805, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3760 + }, + { + "entropy": 0.1408092312514782, + "epoch": 9.108564535585042, + "grad_norm": 0.43550431728363037, + "learning_rate": 5.726719025077231e-06, + "loss": 0.07781847715377807, + "mean_token_accuracy": 0.9768255725502968, + "num_tokens": 10596528.0, + "step": 3780 + }, + { + "epoch": 9.108564535585042, + "eval_entropy": 0.2505798229340757, + "eval_loss": 0.8224650025367737, + "eval_mean_token_accuracy": 0.8463811020502884, + "eval_num_tokens": 10596528.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3780 + }, + { + "entropy": 0.13897503707557918, + "epoch": 9.156815440289506, + "grad_norm": 0.5255228281021118, + "learning_rate": 5.130298001345343e-06, + "loss": 0.07528382539749146, + "mean_token_accuracy": 0.9768329098820686, + "num_tokens": 10655544.0, + "step": 3800 + }, + { + "epoch": 9.156815440289506, + "eval_entropy": 0.2509520294960965, + "eval_loss": 0.8247353434562683, + "eval_mean_token_accuracy": 0.845786559112956, + "eval_num_tokens": 10655544.0, + "eval_runtime": 55.151, + "eval_samples_per_second": 25.747, + "eval_steps_per_second": 3.228, + "step": 3800 + }, + { + "entropy": 0.13998530581593513, + "epoch": 9.205066344993968, + "grad_norm": 0.4610442817211151, + "learning_rate": 4.565980758469731e-06, + "loss": 0.07813523411750793, + "mean_token_accuracy": 0.9769201070070267, + "num_tokens": 10710737.0, + "step": 3820 + }, + { + "epoch": 9.205066344993968, + "eval_entropy": 0.2503350620691696, + "eval_loss": 0.8271610736846924, + "eval_mean_token_accuracy": 0.8455204879969693, + "eval_num_tokens": 10710737.0, + "eval_runtime": 55.1706, + "eval_samples_per_second": 25.738, + "eval_steps_per_second": 3.226, + "step": 3820 + }, + { + "entropy": 0.13162653651088477, + "epoch": 9.253317249698432, + "grad_norm": 0.3627360463142395, + "learning_rate": 4.033926991554922e-06, + "loss": 0.07141604423522949, + "mean_token_accuracy": 0.9792275875806808, + "num_tokens": 10771727.0, + "step": 3840 + }, + { + "epoch": 9.253317249698432, + "eval_entropy": 0.24897396087311627, + "eval_loss": 0.8307036757469177, + "eval_mean_token_accuracy": 0.8453606835911783, + "eval_num_tokens": 10771727.0, + "eval_runtime": 55.168, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3840 + }, + { + "entropy": 0.14422765467315912, + "epoch": 9.301568154402895, + "grad_norm": 0.5706251263618469, + "learning_rate": 3.53428726552335e-06, + "loss": 0.07931464314460754, + "mean_token_accuracy": 0.9752222061157226, + "num_tokens": 10825849.0, + "step": 3860 + }, + { + "epoch": 9.301568154402895, + "eval_entropy": 0.24914598389622872, + "eval_loss": 0.8312752842903137, + "eval_mean_token_accuracy": 0.8455510604917333, + "eval_num_tokens": 10825849.0, + "eval_runtime": 55.1849, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3860 + }, + { + "entropy": 0.13540438804775476, + "epoch": 9.349819059107359, + "grad_norm": 0.6334630250930786, + "learning_rate": 3.0672029725073196e-06, + "loss": 0.07548695802688599, + "mean_token_accuracy": 0.9778543844819069, + "num_tokens": 10884611.0, + "step": 3880 + }, + { + "epoch": 9.349819059107359, + "eval_entropy": 0.24910795722114906, + "eval_loss": 0.8316059112548828, + "eval_mean_token_accuracy": 0.8456973557391864, + "eval_num_tokens": 10884611.0, + "eval_runtime": 55.1728, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3880 + }, + { + "entropy": 0.1410717975348234, + "epoch": 9.398069963811821, + "grad_norm": 0.5009909868240356, + "learning_rate": 2.632806291836666e-06, + "loss": 0.07720760703086853, + "mean_token_accuracy": 0.9768648758530617, + "num_tokens": 10938300.0, + "step": 3900 + }, + { + "epoch": 9.398069963811821, + "eval_entropy": 0.24874219742048992, + "eval_loss": 0.8328408598899841, + "eval_mean_token_accuracy": 0.8455627888775943, + "eval_num_tokens": 10938300.0, + "eval_runtime": 55.1736, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3900 + }, + { + "entropy": 0.13674762714654207, + "epoch": 9.446320868516285, + "grad_norm": 0.46003177762031555, + "learning_rate": 2.231220152633621e-06, + "loss": 0.07583877444267273, + "mean_token_accuracy": 0.9772356480360032, + "num_tokens": 10994442.0, + "step": 3920 + }, + { + "epoch": 9.446320868516285, + "eval_entropy": 0.24896439030933915, + "eval_loss": 0.8331801891326904, + "eval_mean_token_accuracy": 0.8453999262177543, + "eval_num_tokens": 10994442.0, + "eval_runtime": 55.1815, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3920 + }, + { + "entropy": 0.1378044320270419, + "epoch": 9.494571773220748, + "grad_norm": 0.5056300759315491, + "learning_rate": 1.862558199025263e-06, + "loss": 0.07420622110366822, + "mean_token_accuracy": 0.9771117404103279, + "num_tokens": 11052708.0, + "step": 3940 + }, + { + "epoch": 9.494571773220748, + "eval_entropy": 0.2490867761413703, + "eval_loss": 0.8338391780853271, + "eval_mean_token_accuracy": 0.8453461571355884, + "eval_num_tokens": 11052708.0, + "eval_runtime": 59.1613, + "eval_samples_per_second": 24.002, + "eval_steps_per_second": 3.009, + "step": 3940 + }, + { + "entropy": 0.13731490727514029, + "epoch": 9.54282267792521, + "grad_norm": 0.5770965218544006, + "learning_rate": 1.5269247579836162e-06, + "loss": 0.07547505497932434, + "mean_token_accuracy": 0.9772329092025757, + "num_tokens": 11106761.0, + "step": 3960 + }, + { + "epoch": 9.54282267792521, + "eval_entropy": 0.24866497918461147, + "eval_loss": 0.8349169492721558, + "eval_mean_token_accuracy": 0.8455062398080075, + "eval_num_tokens": 11106761.0, + "eval_runtime": 55.1484, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3960 + }, + { + "entropy": 0.13437952492386102, + "epoch": 9.591073582629674, + "grad_norm": 0.5746839046478271, + "learning_rate": 1.2244148098023241e-06, + "loss": 0.0719214141368866, + "mean_token_accuracy": 0.9783813208341599, + "num_tokens": 11163241.0, + "step": 3980 + }, + { + "epoch": 9.591073582629674, + "eval_entropy": 0.2487325757909357, + "eval_loss": 0.8345232009887695, + "eval_mean_token_accuracy": 0.8452854909923639, + "eval_num_tokens": 11163241.0, + "eval_runtime": 55.2029, + "eval_samples_per_second": 25.723, + "eval_steps_per_second": 3.224, + "step": 3980 + }, + { + "entropy": 0.1449177075177431, + "epoch": 9.639324487334138, + "grad_norm": 0.7029407024383545, + "learning_rate": 9.551139612183896e-07, + "loss": 0.08021060228347779, + "mean_token_accuracy": 0.9748102590441704, + "num_tokens": 11215311.0, + "step": 4000 + }, + { + "epoch": 9.639324487334138, + "eval_entropy": 0.24849342898036655, + "eval_loss": 0.8350681066513062, + "eval_mean_token_accuracy": 0.8454755872822879, + "eval_num_tokens": 11215311.0, + "eval_runtime": 55.1818, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 4000 + }, + { + "entropy": 0.13718088436871767, + "epoch": 9.687575392038601, + "grad_norm": 0.4247730076313019, + "learning_rate": 7.190984211864178e-07, + "loss": 0.0763831913471222, + "mean_token_accuracy": 0.9777001023292542, + "num_tokens": 11272587.0, + "step": 4020 + }, + { + "epoch": 9.687575392038601, + "eval_entropy": 0.2485166782241189, + "eval_loss": 0.8347740769386292, + "eval_mean_token_accuracy": 0.8455832382936156, + "eval_num_tokens": 11272587.0, + "eval_runtime": 55.1819, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 4020 + }, + { + "entropy": 0.1414831655099988, + "epoch": 9.735826296743063, + "grad_norm": 0.4344032108783722, + "learning_rate": 5.164349793124746e-07, + "loss": 0.0786937952041626, + "mean_token_accuracy": 0.9762448608875275, + "num_tokens": 11326845.0, + "step": 4040 + }, + { + "epoch": 9.735826296743063, + "eval_entropy": 0.2485147834326444, + "eval_loss": 0.8348782658576965, + "eval_mean_token_accuracy": 0.8454727480250798, + "eval_num_tokens": 11326845.0, + "eval_runtime": 55.1896, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 4040 + }, + { + "entropy": 0.1397345969453454, + "epoch": 9.784077201447527, + "grad_norm": 0.5865362882614136, + "learning_rate": 3.4718098695330847e-07, + "loss": 0.07839923501014709, + "mean_token_accuracy": 0.9766460061073303, + "num_tokens": 11381321.0, + "step": 4060 + }, + { + "epoch": 9.784077201447527, + "eval_entropy": 0.24835219778371662, + "eval_loss": 0.8349990248680115, + "eval_mean_token_accuracy": 0.8452944608216875, + "eval_num_tokens": 11381321.0, + "eval_runtime": 55.1862, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 4060 + }, + { + "entropy": 0.13877951726317406, + "epoch": 9.83232810615199, + "grad_norm": 0.36521604657173157, + "learning_rate": 2.1138434098667948e-07, + "loss": 0.0738587200641632, + "mean_token_accuracy": 0.9764896467328071, + "num_tokens": 11441968.0, + "step": 4080 + }, + { + "epoch": 9.83232810615199, + "eval_entropy": 0.24834532483240193, + "eval_loss": 0.8350111246109009, + "eval_mean_token_accuracy": 0.8456257598453694, + "eval_num_tokens": 11441968.0, + "eval_runtime": 55.1871, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 4080 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.158010514889318e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ef7c15e6c3905b6f953723c4cf34752ff6f6994a --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/trainer_state.json @@ -0,0 +1,4339 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.880579010856454, + "eval_steps": 20, + "global_step": 4100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + }, + { + "entropy": 0.15217966660857202, + "epoch": 8.772014475271412, + "grad_norm": 0.5612668395042419, + "learning_rate": 1.078526238998661e-05, + "loss": 0.09193292260169983, + "mean_token_accuracy": 0.9713213533163071, + "num_tokens": 10209639.0, + "step": 3640 + }, + { + "epoch": 8.772014475271412, + "eval_entropy": 0.26003232162989925, + "eval_loss": 0.7822859883308411, + "eval_mean_token_accuracy": 0.8468798703691932, + "eval_num_tokens": 10209639.0, + "eval_runtime": 55.1916, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3640 + }, + { + "entropy": 0.15681463126093148, + "epoch": 8.820265379975874, + "grad_norm": 0.5372915267944336, + "learning_rate": 9.969343163243224e-06, + "loss": 0.09630222916603089, + "mean_token_accuracy": 0.9702456504106521, + "num_tokens": 10263716.0, + "step": 3660 + }, + { + "epoch": 8.820265379975874, + "eval_entropy": 0.2598928563882796, + "eval_loss": 0.7829110026359558, + "eval_mean_token_accuracy": 0.8469086342983032, + "eval_num_tokens": 10263716.0, + "eval_runtime": 55.1354, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3660 + }, + { + "entropy": 0.1535819811746478, + "epoch": 8.868516284680338, + "grad_norm": 0.575184166431427, + "learning_rate": 9.184158324815683e-06, + "loss": 0.09521135687828064, + "mean_token_accuracy": 0.9696467980742455, + "num_tokens": 10320171.0, + "step": 3680 + }, + { + "epoch": 8.868516284680338, + "eval_entropy": 0.2629028752948461, + "eval_loss": 0.7783747911453247, + "eval_mean_token_accuracy": 0.8463931284593732, + "eval_num_tokens": 10320171.0, + "eval_runtime": 55.1499, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3680 + }, + { + "entropy": 0.15672272052615882, + "epoch": 8.916767189384801, + "grad_norm": 0.610146701335907, + "learning_rate": 8.429930072725457e-06, + "loss": 0.09335047006607056, + "mean_token_accuracy": 0.9709506019949913, + "num_tokens": 10377086.0, + "step": 3700 + }, + { + "epoch": 8.916767189384801, + "eval_entropy": 0.26106021611878044, + "eval_loss": 0.7824530005455017, + "eval_mean_token_accuracy": 0.8467274777005228, + "eval_num_tokens": 10377086.0, + "eval_runtime": 55.1677, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3700 + }, + { + "entropy": 0.15496804118156432, + "epoch": 8.965018094089265, + "grad_norm": 0.8847843408584595, + "learning_rate": 7.706871844646178e-06, + "loss": 0.09552072882652282, + "mean_token_accuracy": 0.9693930178880692, + "num_tokens": 10435272.0, + "step": 3720 + }, + { + "epoch": 8.965018094089265, + "eval_entropy": 0.26175538701622675, + "eval_loss": 0.7833470106124878, + "eval_mean_token_accuracy": 0.8464220507761065, + "eval_num_tokens": 10435272.0, + "eval_runtime": 55.1496, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3720 + }, + { + "entropy": 0.15739625711471605, + "epoch": 9.012062726176115, + "grad_norm": 0.35802221298217773, + "learning_rate": 7.0151882575034775e-06, + "loss": 0.09389110803604125, + "mean_token_accuracy": 0.9715636464265677, + "num_tokens": 10487076.0, + "step": 3740 + }, + { + "epoch": 9.012062726176115, + "eval_entropy": 0.26084648198291155, + "eval_loss": 0.7853291034698486, + "eval_mean_token_accuracy": 0.8463665585169632, + "eval_num_tokens": 10487076.0, + "eval_runtime": 55.1572, + "eval_samples_per_second": 25.745, + "eval_steps_per_second": 3.227, + "step": 3740 + }, + { + "entropy": 0.14215838070958853, + "epoch": 9.060313630880579, + "grad_norm": 0.4684004783630371, + "learning_rate": 6.35507504957069e-06, + "loss": 0.07583575248718262, + "mean_token_accuracy": 0.9782516390085221, + "num_tokens": 10543520.0, + "step": 3760 + }, + { + "epoch": 9.060313630880579, + "eval_entropy": 0.254280548333452, + "eval_loss": 0.807111382484436, + "eval_mean_token_accuracy": 0.8463215992022096, + "eval_num_tokens": 10543520.0, + "eval_runtime": 55.1805, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3760 + }, + { + "entropy": 0.1408092312514782, + "epoch": 9.108564535585042, + "grad_norm": 0.43550431728363037, + "learning_rate": 5.726719025077231e-06, + "loss": 0.07781847715377807, + "mean_token_accuracy": 0.9768255725502968, + "num_tokens": 10596528.0, + "step": 3780 + }, + { + "epoch": 9.108564535585042, + "eval_entropy": 0.2505798229340757, + "eval_loss": 0.8224650025367737, + "eval_mean_token_accuracy": 0.8463811020502884, + "eval_num_tokens": 10596528.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3780 + }, + { + "entropy": 0.13897503707557918, + "epoch": 9.156815440289506, + "grad_norm": 0.5255228281021118, + "learning_rate": 5.130298001345343e-06, + "loss": 0.07528382539749146, + "mean_token_accuracy": 0.9768329098820686, + "num_tokens": 10655544.0, + "step": 3800 + }, + { + "epoch": 9.156815440289506, + "eval_entropy": 0.2509520294960965, + "eval_loss": 0.8247353434562683, + "eval_mean_token_accuracy": 0.845786559112956, + "eval_num_tokens": 10655544.0, + "eval_runtime": 55.151, + "eval_samples_per_second": 25.747, + "eval_steps_per_second": 3.228, + "step": 3800 + }, + { + "entropy": 0.13998530581593513, + "epoch": 9.205066344993968, + "grad_norm": 0.4610442817211151, + "learning_rate": 4.565980758469731e-06, + "loss": 0.07813523411750793, + "mean_token_accuracy": 0.9769201070070267, + "num_tokens": 10710737.0, + "step": 3820 + }, + { + "epoch": 9.205066344993968, + "eval_entropy": 0.2503350620691696, + "eval_loss": 0.8271610736846924, + "eval_mean_token_accuracy": 0.8455204879969693, + "eval_num_tokens": 10710737.0, + "eval_runtime": 55.1706, + "eval_samples_per_second": 25.738, + "eval_steps_per_second": 3.226, + "step": 3820 + }, + { + "entropy": 0.13162653651088477, + "epoch": 9.253317249698432, + "grad_norm": 0.3627360463142395, + "learning_rate": 4.033926991554922e-06, + "loss": 0.07141604423522949, + "mean_token_accuracy": 0.9792275875806808, + "num_tokens": 10771727.0, + "step": 3840 + }, + { + "epoch": 9.253317249698432, + "eval_entropy": 0.24897396087311627, + "eval_loss": 0.8307036757469177, + "eval_mean_token_accuracy": 0.8453606835911783, + "eval_num_tokens": 10771727.0, + "eval_runtime": 55.168, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3840 + }, + { + "entropy": 0.14422765467315912, + "epoch": 9.301568154402895, + "grad_norm": 0.5706251263618469, + "learning_rate": 3.53428726552335e-06, + "loss": 0.07931464314460754, + "mean_token_accuracy": 0.9752222061157226, + "num_tokens": 10825849.0, + "step": 3860 + }, + { + "epoch": 9.301568154402895, + "eval_entropy": 0.24914598389622872, + "eval_loss": 0.8312752842903137, + "eval_mean_token_accuracy": 0.8455510604917333, + "eval_num_tokens": 10825849.0, + "eval_runtime": 55.1849, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3860 + }, + { + "entropy": 0.13540438804775476, + "epoch": 9.349819059107359, + "grad_norm": 0.6334630250930786, + "learning_rate": 3.0672029725073196e-06, + "loss": 0.07548695802688599, + "mean_token_accuracy": 0.9778543844819069, + "num_tokens": 10884611.0, + "step": 3880 + }, + { + "epoch": 9.349819059107359, + "eval_entropy": 0.24910795722114906, + "eval_loss": 0.8316059112548828, + "eval_mean_token_accuracy": 0.8456973557391864, + "eval_num_tokens": 10884611.0, + "eval_runtime": 55.1728, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3880 + }, + { + "entropy": 0.1410717975348234, + "epoch": 9.398069963811821, + "grad_norm": 0.5009909868240356, + "learning_rate": 2.632806291836666e-06, + "loss": 0.07720760703086853, + "mean_token_accuracy": 0.9768648758530617, + "num_tokens": 10938300.0, + "step": 3900 + }, + { + "epoch": 9.398069963811821, + "eval_entropy": 0.24874219742048992, + "eval_loss": 0.8328408598899841, + "eval_mean_token_accuracy": 0.8455627888775943, + "eval_num_tokens": 10938300.0, + "eval_runtime": 55.1736, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3900 + }, + { + "entropy": 0.13674762714654207, + "epoch": 9.446320868516285, + "grad_norm": 0.46003177762031555, + "learning_rate": 2.231220152633621e-06, + "loss": 0.07583877444267273, + "mean_token_accuracy": 0.9772356480360032, + "num_tokens": 10994442.0, + "step": 3920 + }, + { + "epoch": 9.446320868516285, + "eval_entropy": 0.24896439030933915, + "eval_loss": 0.8331801891326904, + "eval_mean_token_accuracy": 0.8453999262177543, + "eval_num_tokens": 10994442.0, + "eval_runtime": 55.1815, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3920 + }, + { + "entropy": 0.1378044320270419, + "epoch": 9.494571773220748, + "grad_norm": 0.5056300759315491, + "learning_rate": 1.862558199025263e-06, + "loss": 0.07420622110366822, + "mean_token_accuracy": 0.9771117404103279, + "num_tokens": 11052708.0, + "step": 3940 + }, + { + "epoch": 9.494571773220748, + "eval_entropy": 0.2490867761413703, + "eval_loss": 0.8338391780853271, + "eval_mean_token_accuracy": 0.8453461571355884, + "eval_num_tokens": 11052708.0, + "eval_runtime": 59.1613, + "eval_samples_per_second": 24.002, + "eval_steps_per_second": 3.009, + "step": 3940 + }, + { + "entropy": 0.13731490727514029, + "epoch": 9.54282267792521, + "grad_norm": 0.5770965218544006, + "learning_rate": 1.5269247579836162e-06, + "loss": 0.07547505497932434, + "mean_token_accuracy": 0.9772329092025757, + "num_tokens": 11106761.0, + "step": 3960 + }, + { + "epoch": 9.54282267792521, + "eval_entropy": 0.24866497918461147, + "eval_loss": 0.8349169492721558, + "eval_mean_token_accuracy": 0.8455062398080075, + "eval_num_tokens": 11106761.0, + "eval_runtime": 55.1484, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3960 + }, + { + "entropy": 0.13437952492386102, + "epoch": 9.591073582629674, + "grad_norm": 0.5746839046478271, + "learning_rate": 1.2244148098023241e-06, + "loss": 0.0719214141368866, + "mean_token_accuracy": 0.9783813208341599, + "num_tokens": 11163241.0, + "step": 3980 + }, + { + "epoch": 9.591073582629674, + "eval_entropy": 0.2487325757909357, + "eval_loss": 0.8345232009887695, + "eval_mean_token_accuracy": 0.8452854909923639, + "eval_num_tokens": 11163241.0, + "eval_runtime": 55.2029, + "eval_samples_per_second": 25.723, + "eval_steps_per_second": 3.224, + "step": 3980 + }, + { + "entropy": 0.1449177075177431, + "epoch": 9.639324487334138, + "grad_norm": 0.7029407024383545, + "learning_rate": 9.551139612183896e-07, + "loss": 0.08021060228347779, + "mean_token_accuracy": 0.9748102590441704, + "num_tokens": 11215311.0, + "step": 4000 + }, + { + "epoch": 9.639324487334138, + "eval_entropy": 0.24849342898036655, + "eval_loss": 0.8350681066513062, + "eval_mean_token_accuracy": 0.8454755872822879, + "eval_num_tokens": 11215311.0, + "eval_runtime": 55.1818, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 4000 + }, + { + "entropy": 0.13718088436871767, + "epoch": 9.687575392038601, + "grad_norm": 0.4247730076313019, + "learning_rate": 7.190984211864178e-07, + "loss": 0.0763831913471222, + "mean_token_accuracy": 0.9777001023292542, + "num_tokens": 11272587.0, + "step": 4020 + }, + { + "epoch": 9.687575392038601, + "eval_entropy": 0.2485166782241189, + "eval_loss": 0.8347740769386292, + "eval_mean_token_accuracy": 0.8455832382936156, + "eval_num_tokens": 11272587.0, + "eval_runtime": 55.1819, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 4020 + }, + { + "entropy": 0.1414831655099988, + "epoch": 9.735826296743063, + "grad_norm": 0.4344032108783722, + "learning_rate": 5.164349793124746e-07, + "loss": 0.0786937952041626, + "mean_token_accuracy": 0.9762448608875275, + "num_tokens": 11326845.0, + "step": 4040 + }, + { + "epoch": 9.735826296743063, + "eval_entropy": 0.2485147834326444, + "eval_loss": 0.8348782658576965, + "eval_mean_token_accuracy": 0.8454727480250798, + "eval_num_tokens": 11326845.0, + "eval_runtime": 55.1896, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 4040 + }, + { + "entropy": 0.1397345969453454, + "epoch": 9.784077201447527, + "grad_norm": 0.5865362882614136, + "learning_rate": 3.4718098695330847e-07, + "loss": 0.07839923501014709, + "mean_token_accuracy": 0.9766460061073303, + "num_tokens": 11381321.0, + "step": 4060 + }, + { + "epoch": 9.784077201447527, + "eval_entropy": 0.24835219778371662, + "eval_loss": 0.8349990248680115, + "eval_mean_token_accuracy": 0.8452944608216875, + "eval_num_tokens": 11381321.0, + "eval_runtime": 55.1862, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 4060 + }, + { + "entropy": 0.13877951726317406, + "epoch": 9.83232810615199, + "grad_norm": 0.36521604657173157, + "learning_rate": 2.1138434098667948e-07, + "loss": 0.0738587200641632, + "mean_token_accuracy": 0.9764896467328071, + "num_tokens": 11441968.0, + "step": 4080 + }, + { + "epoch": 9.83232810615199, + "eval_entropy": 0.24834532483240193, + "eval_loss": 0.8350111246109009, + "eval_mean_token_accuracy": 0.8456257598453694, + "eval_num_tokens": 11441968.0, + "eval_runtime": 55.1871, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 4080 + }, + { + "entropy": 0.1345276204869151, + "epoch": 9.880579010856454, + "grad_norm": 0.45626401901245117, + "learning_rate": 1.0908347025708512e-07, + "loss": 0.07468653917312622, + "mean_token_accuracy": 0.978096280992031, + "num_tokens": 11500487.0, + "step": 4100 + }, + { + "epoch": 9.880579010856454, + "eval_entropy": 0.2485178895713238, + "eval_loss": 0.834865152835846, + "eval_mean_token_accuracy": 0.8453632285085957, + "eval_num_tokens": 11500487.0, + "eval_runtime": 55.1746, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 4100 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.189029460886118e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ef6e08d25f89f60b81bcbd37ef1335478f4511b8 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/trainer_state.json @@ -0,0 +1,4360 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.928829915560916, + "eval_steps": 20, + "global_step": 4120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + }, + { + "entropy": 0.15217966660857202, + "epoch": 8.772014475271412, + "grad_norm": 0.5612668395042419, + "learning_rate": 1.078526238998661e-05, + "loss": 0.09193292260169983, + "mean_token_accuracy": 0.9713213533163071, + "num_tokens": 10209639.0, + "step": 3640 + }, + { + "epoch": 8.772014475271412, + "eval_entropy": 0.26003232162989925, + "eval_loss": 0.7822859883308411, + "eval_mean_token_accuracy": 0.8468798703691932, + "eval_num_tokens": 10209639.0, + "eval_runtime": 55.1916, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3640 + }, + { + "entropy": 0.15681463126093148, + "epoch": 8.820265379975874, + "grad_norm": 0.5372915267944336, + "learning_rate": 9.969343163243224e-06, + "loss": 0.09630222916603089, + "mean_token_accuracy": 0.9702456504106521, + "num_tokens": 10263716.0, + "step": 3660 + }, + { + "epoch": 8.820265379975874, + "eval_entropy": 0.2598928563882796, + "eval_loss": 0.7829110026359558, + "eval_mean_token_accuracy": 0.8469086342983032, + "eval_num_tokens": 10263716.0, + "eval_runtime": 55.1354, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3660 + }, + { + "entropy": 0.1535819811746478, + "epoch": 8.868516284680338, + "grad_norm": 0.575184166431427, + "learning_rate": 9.184158324815683e-06, + "loss": 0.09521135687828064, + "mean_token_accuracy": 0.9696467980742455, + "num_tokens": 10320171.0, + "step": 3680 + }, + { + "epoch": 8.868516284680338, + "eval_entropy": 0.2629028752948461, + "eval_loss": 0.7783747911453247, + "eval_mean_token_accuracy": 0.8463931284593732, + "eval_num_tokens": 10320171.0, + "eval_runtime": 55.1499, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3680 + }, + { + "entropy": 0.15672272052615882, + "epoch": 8.916767189384801, + "grad_norm": 0.610146701335907, + "learning_rate": 8.429930072725457e-06, + "loss": 0.09335047006607056, + "mean_token_accuracy": 0.9709506019949913, + "num_tokens": 10377086.0, + "step": 3700 + }, + { + "epoch": 8.916767189384801, + "eval_entropy": 0.26106021611878044, + "eval_loss": 0.7824530005455017, + "eval_mean_token_accuracy": 0.8467274777005228, + "eval_num_tokens": 10377086.0, + "eval_runtime": 55.1677, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3700 + }, + { + "entropy": 0.15496804118156432, + "epoch": 8.965018094089265, + "grad_norm": 0.8847843408584595, + "learning_rate": 7.706871844646178e-06, + "loss": 0.09552072882652282, + "mean_token_accuracy": 0.9693930178880692, + "num_tokens": 10435272.0, + "step": 3720 + }, + { + "epoch": 8.965018094089265, + "eval_entropy": 0.26175538701622675, + "eval_loss": 0.7833470106124878, + "eval_mean_token_accuracy": 0.8464220507761065, + "eval_num_tokens": 10435272.0, + "eval_runtime": 55.1496, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3720 + }, + { + "entropy": 0.15739625711471605, + "epoch": 9.012062726176115, + "grad_norm": 0.35802221298217773, + "learning_rate": 7.0151882575034775e-06, + "loss": 0.09389110803604125, + "mean_token_accuracy": 0.9715636464265677, + "num_tokens": 10487076.0, + "step": 3740 + }, + { + "epoch": 9.012062726176115, + "eval_entropy": 0.26084648198291155, + "eval_loss": 0.7853291034698486, + "eval_mean_token_accuracy": 0.8463665585169632, + "eval_num_tokens": 10487076.0, + "eval_runtime": 55.1572, + "eval_samples_per_second": 25.745, + "eval_steps_per_second": 3.227, + "step": 3740 + }, + { + "entropy": 0.14215838070958853, + "epoch": 9.060313630880579, + "grad_norm": 0.4684004783630371, + "learning_rate": 6.35507504957069e-06, + "loss": 0.07583575248718262, + "mean_token_accuracy": 0.9782516390085221, + "num_tokens": 10543520.0, + "step": 3760 + }, + { + "epoch": 9.060313630880579, + "eval_entropy": 0.254280548333452, + "eval_loss": 0.807111382484436, + "eval_mean_token_accuracy": 0.8463215992022096, + "eval_num_tokens": 10543520.0, + "eval_runtime": 55.1805, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3760 + }, + { + "entropy": 0.1408092312514782, + "epoch": 9.108564535585042, + "grad_norm": 0.43550431728363037, + "learning_rate": 5.726719025077231e-06, + "loss": 0.07781847715377807, + "mean_token_accuracy": 0.9768255725502968, + "num_tokens": 10596528.0, + "step": 3780 + }, + { + "epoch": 9.108564535585042, + "eval_entropy": 0.2505798229340757, + "eval_loss": 0.8224650025367737, + "eval_mean_token_accuracy": 0.8463811020502884, + "eval_num_tokens": 10596528.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3780 + }, + { + "entropy": 0.13897503707557918, + "epoch": 9.156815440289506, + "grad_norm": 0.5255228281021118, + "learning_rate": 5.130298001345343e-06, + "loss": 0.07528382539749146, + "mean_token_accuracy": 0.9768329098820686, + "num_tokens": 10655544.0, + "step": 3800 + }, + { + "epoch": 9.156815440289506, + "eval_entropy": 0.2509520294960965, + "eval_loss": 0.8247353434562683, + "eval_mean_token_accuracy": 0.845786559112956, + "eval_num_tokens": 10655544.0, + "eval_runtime": 55.151, + "eval_samples_per_second": 25.747, + "eval_steps_per_second": 3.228, + "step": 3800 + }, + { + "entropy": 0.13998530581593513, + "epoch": 9.205066344993968, + "grad_norm": 0.4610442817211151, + "learning_rate": 4.565980758469731e-06, + "loss": 0.07813523411750793, + "mean_token_accuracy": 0.9769201070070267, + "num_tokens": 10710737.0, + "step": 3820 + }, + { + "epoch": 9.205066344993968, + "eval_entropy": 0.2503350620691696, + "eval_loss": 0.8271610736846924, + "eval_mean_token_accuracy": 0.8455204879969693, + "eval_num_tokens": 10710737.0, + "eval_runtime": 55.1706, + "eval_samples_per_second": 25.738, + "eval_steps_per_second": 3.226, + "step": 3820 + }, + { + "entropy": 0.13162653651088477, + "epoch": 9.253317249698432, + "grad_norm": 0.3627360463142395, + "learning_rate": 4.033926991554922e-06, + "loss": 0.07141604423522949, + "mean_token_accuracy": 0.9792275875806808, + "num_tokens": 10771727.0, + "step": 3840 + }, + { + "epoch": 9.253317249698432, + "eval_entropy": 0.24897396087311627, + "eval_loss": 0.8307036757469177, + "eval_mean_token_accuracy": 0.8453606835911783, + "eval_num_tokens": 10771727.0, + "eval_runtime": 55.168, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3840 + }, + { + "entropy": 0.14422765467315912, + "epoch": 9.301568154402895, + "grad_norm": 0.5706251263618469, + "learning_rate": 3.53428726552335e-06, + "loss": 0.07931464314460754, + "mean_token_accuracy": 0.9752222061157226, + "num_tokens": 10825849.0, + "step": 3860 + }, + { + "epoch": 9.301568154402895, + "eval_entropy": 0.24914598389622872, + "eval_loss": 0.8312752842903137, + "eval_mean_token_accuracy": 0.8455510604917333, + "eval_num_tokens": 10825849.0, + "eval_runtime": 55.1849, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3860 + }, + { + "entropy": 0.13540438804775476, + "epoch": 9.349819059107359, + "grad_norm": 0.6334630250930786, + "learning_rate": 3.0672029725073196e-06, + "loss": 0.07548695802688599, + "mean_token_accuracy": 0.9778543844819069, + "num_tokens": 10884611.0, + "step": 3880 + }, + { + "epoch": 9.349819059107359, + "eval_entropy": 0.24910795722114906, + "eval_loss": 0.8316059112548828, + "eval_mean_token_accuracy": 0.8456973557391864, + "eval_num_tokens": 10884611.0, + "eval_runtime": 55.1728, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3880 + }, + { + "entropy": 0.1410717975348234, + "epoch": 9.398069963811821, + "grad_norm": 0.5009909868240356, + "learning_rate": 2.632806291836666e-06, + "loss": 0.07720760703086853, + "mean_token_accuracy": 0.9768648758530617, + "num_tokens": 10938300.0, + "step": 3900 + }, + { + "epoch": 9.398069963811821, + "eval_entropy": 0.24874219742048992, + "eval_loss": 0.8328408598899841, + "eval_mean_token_accuracy": 0.8455627888775943, + "eval_num_tokens": 10938300.0, + "eval_runtime": 55.1736, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3900 + }, + { + "entropy": 0.13674762714654207, + "epoch": 9.446320868516285, + "grad_norm": 0.46003177762031555, + "learning_rate": 2.231220152633621e-06, + "loss": 0.07583877444267273, + "mean_token_accuracy": 0.9772356480360032, + "num_tokens": 10994442.0, + "step": 3920 + }, + { + "epoch": 9.446320868516285, + "eval_entropy": 0.24896439030933915, + "eval_loss": 0.8331801891326904, + "eval_mean_token_accuracy": 0.8453999262177543, + "eval_num_tokens": 10994442.0, + "eval_runtime": 55.1815, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3920 + }, + { + "entropy": 0.1378044320270419, + "epoch": 9.494571773220748, + "grad_norm": 0.5056300759315491, + "learning_rate": 1.862558199025263e-06, + "loss": 0.07420622110366822, + "mean_token_accuracy": 0.9771117404103279, + "num_tokens": 11052708.0, + "step": 3940 + }, + { + "epoch": 9.494571773220748, + "eval_entropy": 0.2490867761413703, + "eval_loss": 0.8338391780853271, + "eval_mean_token_accuracy": 0.8453461571355884, + "eval_num_tokens": 11052708.0, + "eval_runtime": 59.1613, + "eval_samples_per_second": 24.002, + "eval_steps_per_second": 3.009, + "step": 3940 + }, + { + "entropy": 0.13731490727514029, + "epoch": 9.54282267792521, + "grad_norm": 0.5770965218544006, + "learning_rate": 1.5269247579836162e-06, + "loss": 0.07547505497932434, + "mean_token_accuracy": 0.9772329092025757, + "num_tokens": 11106761.0, + "step": 3960 + }, + { + "epoch": 9.54282267792521, + "eval_entropy": 0.24866497918461147, + "eval_loss": 0.8349169492721558, + "eval_mean_token_accuracy": 0.8455062398080075, + "eval_num_tokens": 11106761.0, + "eval_runtime": 55.1484, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3960 + }, + { + "entropy": 0.13437952492386102, + "epoch": 9.591073582629674, + "grad_norm": 0.5746839046478271, + "learning_rate": 1.2244148098023241e-06, + "loss": 0.0719214141368866, + "mean_token_accuracy": 0.9783813208341599, + "num_tokens": 11163241.0, + "step": 3980 + }, + { + "epoch": 9.591073582629674, + "eval_entropy": 0.2487325757909357, + "eval_loss": 0.8345232009887695, + "eval_mean_token_accuracy": 0.8452854909923639, + "eval_num_tokens": 11163241.0, + "eval_runtime": 55.2029, + "eval_samples_per_second": 25.723, + "eval_steps_per_second": 3.224, + "step": 3980 + }, + { + "entropy": 0.1449177075177431, + "epoch": 9.639324487334138, + "grad_norm": 0.7029407024383545, + "learning_rate": 9.551139612183896e-07, + "loss": 0.08021060228347779, + "mean_token_accuracy": 0.9748102590441704, + "num_tokens": 11215311.0, + "step": 4000 + }, + { + "epoch": 9.639324487334138, + "eval_entropy": 0.24849342898036655, + "eval_loss": 0.8350681066513062, + "eval_mean_token_accuracy": 0.8454755872822879, + "eval_num_tokens": 11215311.0, + "eval_runtime": 55.1818, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 4000 + }, + { + "entropy": 0.13718088436871767, + "epoch": 9.687575392038601, + "grad_norm": 0.4247730076313019, + "learning_rate": 7.190984211864178e-07, + "loss": 0.0763831913471222, + "mean_token_accuracy": 0.9777001023292542, + "num_tokens": 11272587.0, + "step": 4020 + }, + { + "epoch": 9.687575392038601, + "eval_entropy": 0.2485166782241189, + "eval_loss": 0.8347740769386292, + "eval_mean_token_accuracy": 0.8455832382936156, + "eval_num_tokens": 11272587.0, + "eval_runtime": 55.1819, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 4020 + }, + { + "entropy": 0.1414831655099988, + "epoch": 9.735826296743063, + "grad_norm": 0.4344032108783722, + "learning_rate": 5.164349793124746e-07, + "loss": 0.0786937952041626, + "mean_token_accuracy": 0.9762448608875275, + "num_tokens": 11326845.0, + "step": 4040 + }, + { + "epoch": 9.735826296743063, + "eval_entropy": 0.2485147834326444, + "eval_loss": 0.8348782658576965, + "eval_mean_token_accuracy": 0.8454727480250798, + "eval_num_tokens": 11326845.0, + "eval_runtime": 55.1896, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 4040 + }, + { + "entropy": 0.1397345969453454, + "epoch": 9.784077201447527, + "grad_norm": 0.5865362882614136, + "learning_rate": 3.4718098695330847e-07, + "loss": 0.07839923501014709, + "mean_token_accuracy": 0.9766460061073303, + "num_tokens": 11381321.0, + "step": 4060 + }, + { + "epoch": 9.784077201447527, + "eval_entropy": 0.24835219778371662, + "eval_loss": 0.8349990248680115, + "eval_mean_token_accuracy": 0.8452944608216875, + "eval_num_tokens": 11381321.0, + "eval_runtime": 55.1862, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 4060 + }, + { + "entropy": 0.13877951726317406, + "epoch": 9.83232810615199, + "grad_norm": 0.36521604657173157, + "learning_rate": 2.1138434098667948e-07, + "loss": 0.0738587200641632, + "mean_token_accuracy": 0.9764896467328071, + "num_tokens": 11441968.0, + "step": 4080 + }, + { + "epoch": 9.83232810615199, + "eval_entropy": 0.24834532483240193, + "eval_loss": 0.8350111246109009, + "eval_mean_token_accuracy": 0.8456257598453694, + "eval_num_tokens": 11441968.0, + "eval_runtime": 55.1871, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 4080 + }, + { + "entropy": 0.1345276204869151, + "epoch": 9.880579010856454, + "grad_norm": 0.45626401901245117, + "learning_rate": 1.0908347025708512e-07, + "loss": 0.07468653917312622, + "mean_token_accuracy": 0.978096280992031, + "num_tokens": 11500487.0, + "step": 4100 + }, + { + "epoch": 9.880579010856454, + "eval_entropy": 0.2485178895713238, + "eval_loss": 0.834865152835846, + "eval_mean_token_accuracy": 0.8453632285085957, + "eval_num_tokens": 11500487.0, + "eval_runtime": 55.1746, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 4100 + }, + { + "entropy": 0.1314420524984598, + "epoch": 9.928829915560916, + "grad_norm": 0.5756514072418213, + "learning_rate": 4.0307324700819896e-08, + "loss": 0.07114983201026917, + "mean_token_accuracy": 0.9784522473812103, + "num_tokens": 11562246.0, + "step": 4120 + }, + { + "epoch": 9.928829915560916, + "eval_entropy": 0.24849412364236426, + "eval_loss": 0.8347920775413513, + "eval_mean_token_accuracy": 0.8454210158814205, + "eval_num_tokens": 11562246.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 4120 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.222512323160678e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bf3f09a807315c9f9fb1ad28b7112a9c146a267d --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/trainer_state.json @@ -0,0 +1,4381 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.97708082026538, + "eval_steps": 20, + "global_step": 4140, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + }, + { + "entropy": 0.15217966660857202, + "epoch": 8.772014475271412, + "grad_norm": 0.5612668395042419, + "learning_rate": 1.078526238998661e-05, + "loss": 0.09193292260169983, + "mean_token_accuracy": 0.9713213533163071, + "num_tokens": 10209639.0, + "step": 3640 + }, + { + "epoch": 8.772014475271412, + "eval_entropy": 0.26003232162989925, + "eval_loss": 0.7822859883308411, + "eval_mean_token_accuracy": 0.8468798703691932, + "eval_num_tokens": 10209639.0, + "eval_runtime": 55.1916, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3640 + }, + { + "entropy": 0.15681463126093148, + "epoch": 8.820265379975874, + "grad_norm": 0.5372915267944336, + "learning_rate": 9.969343163243224e-06, + "loss": 0.09630222916603089, + "mean_token_accuracy": 0.9702456504106521, + "num_tokens": 10263716.0, + "step": 3660 + }, + { + "epoch": 8.820265379975874, + "eval_entropy": 0.2598928563882796, + "eval_loss": 0.7829110026359558, + "eval_mean_token_accuracy": 0.8469086342983032, + "eval_num_tokens": 10263716.0, + "eval_runtime": 55.1354, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3660 + }, + { + "entropy": 0.1535819811746478, + "epoch": 8.868516284680338, + "grad_norm": 0.575184166431427, + "learning_rate": 9.184158324815683e-06, + "loss": 0.09521135687828064, + "mean_token_accuracy": 0.9696467980742455, + "num_tokens": 10320171.0, + "step": 3680 + }, + { + "epoch": 8.868516284680338, + "eval_entropy": 0.2629028752948461, + "eval_loss": 0.7783747911453247, + "eval_mean_token_accuracy": 0.8463931284593732, + "eval_num_tokens": 10320171.0, + "eval_runtime": 55.1499, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3680 + }, + { + "entropy": 0.15672272052615882, + "epoch": 8.916767189384801, + "grad_norm": 0.610146701335907, + "learning_rate": 8.429930072725457e-06, + "loss": 0.09335047006607056, + "mean_token_accuracy": 0.9709506019949913, + "num_tokens": 10377086.0, + "step": 3700 + }, + { + "epoch": 8.916767189384801, + "eval_entropy": 0.26106021611878044, + "eval_loss": 0.7824530005455017, + "eval_mean_token_accuracy": 0.8467274777005228, + "eval_num_tokens": 10377086.0, + "eval_runtime": 55.1677, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3700 + }, + { + "entropy": 0.15496804118156432, + "epoch": 8.965018094089265, + "grad_norm": 0.8847843408584595, + "learning_rate": 7.706871844646178e-06, + "loss": 0.09552072882652282, + "mean_token_accuracy": 0.9693930178880692, + "num_tokens": 10435272.0, + "step": 3720 + }, + { + "epoch": 8.965018094089265, + "eval_entropy": 0.26175538701622675, + "eval_loss": 0.7833470106124878, + "eval_mean_token_accuracy": 0.8464220507761065, + "eval_num_tokens": 10435272.0, + "eval_runtime": 55.1496, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3720 + }, + { + "entropy": 0.15739625711471605, + "epoch": 9.012062726176115, + "grad_norm": 0.35802221298217773, + "learning_rate": 7.0151882575034775e-06, + "loss": 0.09389110803604125, + "mean_token_accuracy": 0.9715636464265677, + "num_tokens": 10487076.0, + "step": 3740 + }, + { + "epoch": 9.012062726176115, + "eval_entropy": 0.26084648198291155, + "eval_loss": 0.7853291034698486, + "eval_mean_token_accuracy": 0.8463665585169632, + "eval_num_tokens": 10487076.0, + "eval_runtime": 55.1572, + "eval_samples_per_second": 25.745, + "eval_steps_per_second": 3.227, + "step": 3740 + }, + { + "entropy": 0.14215838070958853, + "epoch": 9.060313630880579, + "grad_norm": 0.4684004783630371, + "learning_rate": 6.35507504957069e-06, + "loss": 0.07583575248718262, + "mean_token_accuracy": 0.9782516390085221, + "num_tokens": 10543520.0, + "step": 3760 + }, + { + "epoch": 9.060313630880579, + "eval_entropy": 0.254280548333452, + "eval_loss": 0.807111382484436, + "eval_mean_token_accuracy": 0.8463215992022096, + "eval_num_tokens": 10543520.0, + "eval_runtime": 55.1805, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3760 + }, + { + "entropy": 0.1408092312514782, + "epoch": 9.108564535585042, + "grad_norm": 0.43550431728363037, + "learning_rate": 5.726719025077231e-06, + "loss": 0.07781847715377807, + "mean_token_accuracy": 0.9768255725502968, + "num_tokens": 10596528.0, + "step": 3780 + }, + { + "epoch": 9.108564535585042, + "eval_entropy": 0.2505798229340757, + "eval_loss": 0.8224650025367737, + "eval_mean_token_accuracy": 0.8463811020502884, + "eval_num_tokens": 10596528.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3780 + }, + { + "entropy": 0.13897503707557918, + "epoch": 9.156815440289506, + "grad_norm": 0.5255228281021118, + "learning_rate": 5.130298001345343e-06, + "loss": 0.07528382539749146, + "mean_token_accuracy": 0.9768329098820686, + "num_tokens": 10655544.0, + "step": 3800 + }, + { + "epoch": 9.156815440289506, + "eval_entropy": 0.2509520294960965, + "eval_loss": 0.8247353434562683, + "eval_mean_token_accuracy": 0.845786559112956, + "eval_num_tokens": 10655544.0, + "eval_runtime": 55.151, + "eval_samples_per_second": 25.747, + "eval_steps_per_second": 3.228, + "step": 3800 + }, + { + "entropy": 0.13998530581593513, + "epoch": 9.205066344993968, + "grad_norm": 0.4610442817211151, + "learning_rate": 4.565980758469731e-06, + "loss": 0.07813523411750793, + "mean_token_accuracy": 0.9769201070070267, + "num_tokens": 10710737.0, + "step": 3820 + }, + { + "epoch": 9.205066344993968, + "eval_entropy": 0.2503350620691696, + "eval_loss": 0.8271610736846924, + "eval_mean_token_accuracy": 0.8455204879969693, + "eval_num_tokens": 10710737.0, + "eval_runtime": 55.1706, + "eval_samples_per_second": 25.738, + "eval_steps_per_second": 3.226, + "step": 3820 + }, + { + "entropy": 0.13162653651088477, + "epoch": 9.253317249698432, + "grad_norm": 0.3627360463142395, + "learning_rate": 4.033926991554922e-06, + "loss": 0.07141604423522949, + "mean_token_accuracy": 0.9792275875806808, + "num_tokens": 10771727.0, + "step": 3840 + }, + { + "epoch": 9.253317249698432, + "eval_entropy": 0.24897396087311627, + "eval_loss": 0.8307036757469177, + "eval_mean_token_accuracy": 0.8453606835911783, + "eval_num_tokens": 10771727.0, + "eval_runtime": 55.168, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3840 + }, + { + "entropy": 0.14422765467315912, + "epoch": 9.301568154402895, + "grad_norm": 0.5706251263618469, + "learning_rate": 3.53428726552335e-06, + "loss": 0.07931464314460754, + "mean_token_accuracy": 0.9752222061157226, + "num_tokens": 10825849.0, + "step": 3860 + }, + { + "epoch": 9.301568154402895, + "eval_entropy": 0.24914598389622872, + "eval_loss": 0.8312752842903137, + "eval_mean_token_accuracy": 0.8455510604917333, + "eval_num_tokens": 10825849.0, + "eval_runtime": 55.1849, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3860 + }, + { + "entropy": 0.13540438804775476, + "epoch": 9.349819059107359, + "grad_norm": 0.6334630250930786, + "learning_rate": 3.0672029725073196e-06, + "loss": 0.07548695802688599, + "mean_token_accuracy": 0.9778543844819069, + "num_tokens": 10884611.0, + "step": 3880 + }, + { + "epoch": 9.349819059107359, + "eval_entropy": 0.24910795722114906, + "eval_loss": 0.8316059112548828, + "eval_mean_token_accuracy": 0.8456973557391864, + "eval_num_tokens": 10884611.0, + "eval_runtime": 55.1728, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3880 + }, + { + "entropy": 0.1410717975348234, + "epoch": 9.398069963811821, + "grad_norm": 0.5009909868240356, + "learning_rate": 2.632806291836666e-06, + "loss": 0.07720760703086853, + "mean_token_accuracy": 0.9768648758530617, + "num_tokens": 10938300.0, + "step": 3900 + }, + { + "epoch": 9.398069963811821, + "eval_entropy": 0.24874219742048992, + "eval_loss": 0.8328408598899841, + "eval_mean_token_accuracy": 0.8455627888775943, + "eval_num_tokens": 10938300.0, + "eval_runtime": 55.1736, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3900 + }, + { + "entropy": 0.13674762714654207, + "epoch": 9.446320868516285, + "grad_norm": 0.46003177762031555, + "learning_rate": 2.231220152633621e-06, + "loss": 0.07583877444267273, + "mean_token_accuracy": 0.9772356480360032, + "num_tokens": 10994442.0, + "step": 3920 + }, + { + "epoch": 9.446320868516285, + "eval_entropy": 0.24896439030933915, + "eval_loss": 0.8331801891326904, + "eval_mean_token_accuracy": 0.8453999262177543, + "eval_num_tokens": 10994442.0, + "eval_runtime": 55.1815, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3920 + }, + { + "entropy": 0.1378044320270419, + "epoch": 9.494571773220748, + "grad_norm": 0.5056300759315491, + "learning_rate": 1.862558199025263e-06, + "loss": 0.07420622110366822, + "mean_token_accuracy": 0.9771117404103279, + "num_tokens": 11052708.0, + "step": 3940 + }, + { + "epoch": 9.494571773220748, + "eval_entropy": 0.2490867761413703, + "eval_loss": 0.8338391780853271, + "eval_mean_token_accuracy": 0.8453461571355884, + "eval_num_tokens": 11052708.0, + "eval_runtime": 59.1613, + "eval_samples_per_second": 24.002, + "eval_steps_per_second": 3.009, + "step": 3940 + }, + { + "entropy": 0.13731490727514029, + "epoch": 9.54282267792521, + "grad_norm": 0.5770965218544006, + "learning_rate": 1.5269247579836162e-06, + "loss": 0.07547505497932434, + "mean_token_accuracy": 0.9772329092025757, + "num_tokens": 11106761.0, + "step": 3960 + }, + { + "epoch": 9.54282267792521, + "eval_entropy": 0.24866497918461147, + "eval_loss": 0.8349169492721558, + "eval_mean_token_accuracy": 0.8455062398080075, + "eval_num_tokens": 11106761.0, + "eval_runtime": 55.1484, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3960 + }, + { + "entropy": 0.13437952492386102, + "epoch": 9.591073582629674, + "grad_norm": 0.5746839046478271, + "learning_rate": 1.2244148098023241e-06, + "loss": 0.0719214141368866, + "mean_token_accuracy": 0.9783813208341599, + "num_tokens": 11163241.0, + "step": 3980 + }, + { + "epoch": 9.591073582629674, + "eval_entropy": 0.2487325757909357, + "eval_loss": 0.8345232009887695, + "eval_mean_token_accuracy": 0.8452854909923639, + "eval_num_tokens": 11163241.0, + "eval_runtime": 55.2029, + "eval_samples_per_second": 25.723, + "eval_steps_per_second": 3.224, + "step": 3980 + }, + { + "entropy": 0.1449177075177431, + "epoch": 9.639324487334138, + "grad_norm": 0.7029407024383545, + "learning_rate": 9.551139612183896e-07, + "loss": 0.08021060228347779, + "mean_token_accuracy": 0.9748102590441704, + "num_tokens": 11215311.0, + "step": 4000 + }, + { + "epoch": 9.639324487334138, + "eval_entropy": 0.24849342898036655, + "eval_loss": 0.8350681066513062, + "eval_mean_token_accuracy": 0.8454755872822879, + "eval_num_tokens": 11215311.0, + "eval_runtime": 55.1818, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 4000 + }, + { + "entropy": 0.13718088436871767, + "epoch": 9.687575392038601, + "grad_norm": 0.4247730076313019, + "learning_rate": 7.190984211864178e-07, + "loss": 0.0763831913471222, + "mean_token_accuracy": 0.9777001023292542, + "num_tokens": 11272587.0, + "step": 4020 + }, + { + "epoch": 9.687575392038601, + "eval_entropy": 0.2485166782241189, + "eval_loss": 0.8347740769386292, + "eval_mean_token_accuracy": 0.8455832382936156, + "eval_num_tokens": 11272587.0, + "eval_runtime": 55.1819, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 4020 + }, + { + "entropy": 0.1414831655099988, + "epoch": 9.735826296743063, + "grad_norm": 0.4344032108783722, + "learning_rate": 5.164349793124746e-07, + "loss": 0.0786937952041626, + "mean_token_accuracy": 0.9762448608875275, + "num_tokens": 11326845.0, + "step": 4040 + }, + { + "epoch": 9.735826296743063, + "eval_entropy": 0.2485147834326444, + "eval_loss": 0.8348782658576965, + "eval_mean_token_accuracy": 0.8454727480250798, + "eval_num_tokens": 11326845.0, + "eval_runtime": 55.1896, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 4040 + }, + { + "entropy": 0.1397345969453454, + "epoch": 9.784077201447527, + "grad_norm": 0.5865362882614136, + "learning_rate": 3.4718098695330847e-07, + "loss": 0.07839923501014709, + "mean_token_accuracy": 0.9766460061073303, + "num_tokens": 11381321.0, + "step": 4060 + }, + { + "epoch": 9.784077201447527, + "eval_entropy": 0.24835219778371662, + "eval_loss": 0.8349990248680115, + "eval_mean_token_accuracy": 0.8452944608216875, + "eval_num_tokens": 11381321.0, + "eval_runtime": 55.1862, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 4060 + }, + { + "entropy": 0.13877951726317406, + "epoch": 9.83232810615199, + "grad_norm": 0.36521604657173157, + "learning_rate": 2.1138434098667948e-07, + "loss": 0.0738587200641632, + "mean_token_accuracy": 0.9764896467328071, + "num_tokens": 11441968.0, + "step": 4080 + }, + { + "epoch": 9.83232810615199, + "eval_entropy": 0.24834532483240193, + "eval_loss": 0.8350111246109009, + "eval_mean_token_accuracy": 0.8456257598453694, + "eval_num_tokens": 11441968.0, + "eval_runtime": 55.1871, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 4080 + }, + { + "entropy": 0.1345276204869151, + "epoch": 9.880579010856454, + "grad_norm": 0.45626401901245117, + "learning_rate": 1.0908347025708512e-07, + "loss": 0.07468653917312622, + "mean_token_accuracy": 0.978096280992031, + "num_tokens": 11500487.0, + "step": 4100 + }, + { + "epoch": 9.880579010856454, + "eval_entropy": 0.2485178895713238, + "eval_loss": 0.834865152835846, + "eval_mean_token_accuracy": 0.8453632285085957, + "eval_num_tokens": 11500487.0, + "eval_runtime": 55.1746, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 4100 + }, + { + "entropy": 0.1314420524984598, + "epoch": 9.928829915560916, + "grad_norm": 0.5756514072418213, + "learning_rate": 4.0307324700819896e-08, + "loss": 0.07114983201026917, + "mean_token_accuracy": 0.9784522473812103, + "num_tokens": 11562246.0, + "step": 4120 + }, + { + "epoch": 9.928829915560916, + "eval_entropy": 0.24849412364236426, + "eval_loss": 0.8347920775413513, + "eval_mean_token_accuracy": 0.8454210158814205, + "eval_num_tokens": 11562246.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 4120 + }, + { + "entropy": 0.14091254398226738, + "epoch": 9.97708082026538, + "grad_norm": 0.4619421064853668, + "learning_rate": 5.075367153567275e-09, + "loss": 0.07807959914207459, + "mean_token_accuracy": 0.9760556846857071, + "num_tokens": 11614714.0, + "step": 4140 + }, + { + "epoch": 9.97708082026538, + "eval_entropy": 0.24850971368926295, + "eval_loss": 0.8348681926727295, + "eval_mean_token_accuracy": 0.8453842609116201, + "eval_num_tokens": 11614714.0, + "eval_runtime": 55.1689, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 4140 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.24999906917929e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..85d7ccdc7a5e89ba20ea4d1c3cc1f99a77dc3116 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/trainer_state.json @@ -0,0 +1,4392 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "eval_steps": 20, + "global_step": 4150, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + }, + { + "entropy": 0.5248019628226757, + "epoch": 2.4101326899879374, + "grad_norm": 0.4942329525947571, + "learning_rate": 0.00022313151729919296, + "loss": 0.4856869220733643, + "mean_token_accuracy": 0.8571616068482399, + "num_tokens": 2808227.0, + "step": 1000 + }, + { + "epoch": 2.4101326899879374, + "eval_entropy": 0.5513194481308541, + "eval_loss": 0.5423293709754944, + "eval_mean_token_accuracy": 0.8457836836911319, + "eval_num_tokens": 2808227.0, + "eval_runtime": 55.2589, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1000 + }, + { + "entropy": 0.5259068854153156, + "epoch": 2.4583835946924006, + "grad_norm": 0.36930060386657715, + "learning_rate": 0.00022217585350392177, + "loss": 0.4831561088562012, + "mean_token_accuracy": 0.8585615202784538, + "num_tokens": 2867952.0, + "step": 1020 + }, + { + "epoch": 2.4583835946924006, + "eval_entropy": 0.5478445388627856, + "eval_loss": 0.5382154583930969, + "eval_mean_token_accuracy": 0.8472998199168216, + "eval_num_tokens": 2867952.0, + "eval_runtime": 55.2641, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1020 + }, + { + "entropy": 0.5247037045657634, + "epoch": 2.5066344993968634, + "grad_norm": 0.4037776291370392, + "learning_rate": 0.00022119087216470113, + "loss": 0.4769702434539795, + "mean_token_accuracy": 0.8580174028873444, + "num_tokens": 2921659.0, + "step": 1040 + }, + { + "epoch": 2.5066344993968634, + "eval_entropy": 0.5579622049679916, + "eval_loss": 0.537755012512207, + "eval_mean_token_accuracy": 0.847535682863064, + "eval_num_tokens": 2921659.0, + "eval_runtime": 55.2532, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 1040 + }, + { + "entropy": 0.5346525005996228, + "epoch": 2.554885404101327, + "grad_norm": 0.4666038453578949, + "learning_rate": 0.00022017685201959885, + "loss": 0.4858390331268311, + "mean_token_accuracy": 0.8568937763571739, + "num_tokens": 2976634.0, + "step": 1060 + }, + { + "epoch": 2.554885404101327, + "eval_entropy": 0.5403439740786392, + "eval_loss": 0.5404531359672546, + "eval_mean_token_accuracy": 0.847268155451571, + "eval_num_tokens": 2976634.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 1060 + }, + { + "entropy": 0.52818808183074, + "epoch": 2.60313630880579, + "grad_norm": 0.46565404534339905, + "learning_rate": 0.00021913408002432124, + "loss": 0.48402113914489747, + "mean_token_accuracy": 0.8563135221600533, + "num_tokens": 3033832.0, + "step": 1080 + }, + { + "epoch": 2.60313630880579, + "eval_entropy": 0.5540322053633379, + "eval_loss": 0.5376110076904297, + "eval_mean_token_accuracy": 0.8479098319337609, + "eval_num_tokens": 3033832.0, + "eval_runtime": 55.2609, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1080 + }, + { + "entropy": 0.5257870592176914, + "epoch": 2.651387213510253, + "grad_norm": 0.42831000685691833, + "learning_rate": 0.00021806285127100823, + "loss": 0.48136534690856936, + "mean_token_accuracy": 0.8569760799407959, + "num_tokens": 3091618.0, + "step": 1100 + }, + { + "epoch": 2.651387213510253, + "eval_entropy": 0.5380372698052546, + "eval_loss": 0.5353341698646545, + "eval_mean_token_accuracy": 0.8481398174601994, + "eval_num_tokens": 3091618.0, + "eval_runtime": 55.2823, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 1100 + }, + { + "entropy": 0.5353380210697651, + "epoch": 2.6996381182147164, + "grad_norm": 0.42232778668403625, + "learning_rate": 0.00021696346890472552, + "loss": 0.49104962348937986, + "mean_token_accuracy": 0.8557631894946098, + "num_tokens": 3145557.0, + "step": 1120 + }, + { + "epoch": 2.6996381182147164, + "eval_entropy": 0.5383762990155917, + "eval_loss": 0.5312851071357727, + "eval_mean_token_accuracy": 0.8492028927535153, + "eval_num_tokens": 3145557.0, + "eval_runtime": 55.2845, + "eval_samples_per_second": 25.685, + "eval_steps_per_second": 3.22, + "step": 1120 + }, + { + "entropy": 0.5406845368444919, + "epoch": 2.7478890229191797, + "grad_norm": 0.4468596577644348, + "learning_rate": 0.0002158362440376784, + "loss": 0.49189152717590334, + "mean_token_accuracy": 0.8547335088253021, + "num_tokens": 3197564.0, + "step": 1140 + }, + { + "epoch": 2.7478890229191797, + "eval_entropy": 0.5433913787429252, + "eval_loss": 0.530707597732544, + "eval_mean_token_accuracy": 0.8490624461281165, + "eval_num_tokens": 3197564.0, + "eval_runtime": 55.287, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1140 + }, + { + "entropy": 0.5258526459336281, + "epoch": 2.796139927623643, + "grad_norm": 0.391728937625885, + "learning_rate": 0.0002146814956611704, + "loss": 0.4771277904510498, + "mean_token_accuracy": 0.8589153334498405, + "num_tokens": 3254632.0, + "step": 1160 + }, + { + "epoch": 2.796139927623643, + "eval_entropy": 0.5291422690903202, + "eval_loss": 0.5308067798614502, + "eval_mean_token_accuracy": 0.8499355989225795, + "eval_num_tokens": 3254632.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1160 + }, + { + "entropy": 0.5414377674460411, + "epoch": 2.844390832328106, + "grad_norm": 0.47752824425697327, + "learning_rate": 0.0002134995505553327, + "loss": 0.4902364730834961, + "mean_token_accuracy": 0.8546424314379693, + "num_tokens": 3309169.0, + "step": 1180 + }, + { + "epoch": 2.844390832328106, + "eval_entropy": 0.5259190955188837, + "eval_loss": 0.5349776744842529, + "eval_mean_token_accuracy": 0.8489457506142305, + "eval_num_tokens": 3309169.0, + "eval_runtime": 55.2731, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1180 + }, + { + "entropy": 0.5215715534985066, + "epoch": 2.8926417370325694, + "grad_norm": 0.41028308868408203, + "learning_rate": 0.00021229074319664928, + "loss": 0.4762150287628174, + "mean_token_accuracy": 0.8578165486454964, + "num_tokens": 3365585.0, + "step": 1200 + }, + { + "epoch": 2.8926417370325694, + "eval_entropy": 0.5482661454530244, + "eval_loss": 0.5323300957679749, + "eval_mean_token_accuracy": 0.848629221487581, + "eval_num_tokens": 3365585.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 1200 + }, + { + "entropy": 0.5206520460546017, + "epoch": 2.9408926417370327, + "grad_norm": 0.5318732857704163, + "learning_rate": 0.00021105541566330375, + "loss": 0.4734694480895996, + "mean_token_accuracy": 0.8595159903168679, + "num_tokens": 3424180.0, + "step": 1220 + }, + { + "epoch": 2.9408926417370327, + "eval_entropy": 0.5315383620811313, + "eval_loss": 0.5288159251213074, + "eval_mean_token_accuracy": 0.8504117121187489, + "eval_num_tokens": 3424180.0, + "eval_runtime": 55.2871, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1220 + }, + { + "entropy": 0.5353466637432576, + "epoch": 2.989143546441496, + "grad_norm": 0.37469980120658875, + "learning_rate": 0.00020979391753837555, + "loss": 0.48825845718383787, + "mean_token_accuracy": 0.8551038816571236, + "num_tokens": 3478101.0, + "step": 1240 + }, + { + "epoch": 2.989143546441496, + "eval_entropy": 0.545977213074652, + "eval_loss": 0.5272343754768372, + "eval_mean_token_accuracy": 0.8505403610427728, + "eval_num_tokens": 3478101.0, + "eval_runtime": 55.2873, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1240 + }, + { + "entropy": 0.4871322696025555, + "epoch": 3.0361881785283473, + "grad_norm": 0.43063971400260925, + "learning_rate": 0.00020850660581091197, + "loss": 0.4392428398132324, + "mean_token_accuracy": 0.8684728237298819, + "num_tokens": 3533413.0, + "step": 1260 + }, + { + "epoch": 3.0361881785283473, + "eval_entropy": 0.47941737563422554, + "eval_loss": 0.5416561961174011, + "eval_mean_token_accuracy": 0.8500779602634773, + "eval_num_tokens": 3533413.0, + "eval_runtime": 55.2765, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1260 + }, + { + "entropy": 0.4550061285495758, + "epoch": 3.0844390832328106, + "grad_norm": 0.4342574179172516, + "learning_rate": 0.00020719384477490443, + "loss": 0.4091750144958496, + "mean_token_accuracy": 0.8743084371089935, + "num_tokens": 3594172.0, + "step": 1280 + }, + { + "epoch": 3.0844390832328106, + "eval_entropy": 0.5002224577611751, + "eval_loss": 0.5408567786216736, + "eval_mean_token_accuracy": 0.8497345280111506, + "eval_num_tokens": 3594172.0, + "eval_runtime": 55.278, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1280 + }, + { + "entropy": 0.47481488808989525, + "epoch": 3.132689987937274, + "grad_norm": 0.4471692740917206, + "learning_rate": 0.00020585600592619766, + "loss": 0.42618322372436523, + "mean_token_accuracy": 0.8696768507361412, + "num_tokens": 3654603.0, + "step": 1300 + }, + { + "epoch": 3.132689987937274, + "eval_entropy": 0.4904685912842161, + "eval_loss": 0.5353798866271973, + "eval_mean_token_accuracy": 0.8513298855068978, + "eval_num_tokens": 3654603.0, + "eval_runtime": 55.2946, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 1300 + }, + { + "entropy": 0.4613390527665615, + "epoch": 3.180940892641737, + "grad_norm": 0.435248464345932, + "learning_rate": 0.00020449346785736077, + "loss": 0.4190497398376465, + "mean_token_accuracy": 0.8722693488001824, + "num_tokens": 3718302.0, + "step": 1320 + }, + { + "epoch": 3.180940892641737, + "eval_entropy": 0.49038933937469226, + "eval_loss": 0.5339825749397278, + "eval_mean_token_accuracy": 0.8512222123949715, + "eval_num_tokens": 3718302.0, + "eval_runtime": 55.2777, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1320 + }, + { + "entropy": 0.46275881826877596, + "epoch": 3.2291917973462003, + "grad_norm": 0.5253846049308777, + "learning_rate": 0.00020310661615054987, + "loss": 0.4195539474487305, + "mean_token_accuracy": 0.8733018428087235, + "num_tokens": 3773964.0, + "step": 1340 + }, + { + "epoch": 3.2291917973462003, + "eval_entropy": 0.4907550284366929, + "eval_loss": 0.5359752178192139, + "eval_mean_token_accuracy": 0.8519132944305291, + "eval_num_tokens": 3773964.0, + "eval_runtime": 55.2579, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1340 + }, + { + "entropy": 0.478652510792017, + "epoch": 3.2774427020506636, + "grad_norm": 0.4682963788509369, + "learning_rate": 0.00020169584326839324, + "loss": 0.4342951774597168, + "mean_token_accuracy": 0.8700524374842644, + "num_tokens": 3831303.0, + "step": 1360 + }, + { + "epoch": 3.2774427020506636, + "eval_entropy": 0.5014148172032967, + "eval_loss": 0.5317310094833374, + "eval_mean_token_accuracy": 0.8515536684668465, + "eval_num_tokens": 3831303.0, + "eval_runtime": 55.2727, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1360 + }, + { + "entropy": 0.48046978786587713, + "epoch": 3.325693606755127, + "grad_norm": 0.5463082194328308, + "learning_rate": 0.0002002615484429286, + "loss": 0.4319463729858398, + "mean_token_accuracy": 0.866960471868515, + "num_tokens": 3883792.0, + "step": 1380 + }, + { + "epoch": 3.325693606755127, + "eval_entropy": 0.4927057652326112, + "eval_loss": 0.534314751625061, + "eval_mean_token_accuracy": 0.8516227616352982, + "eval_num_tokens": 3883792.0, + "eval_runtime": 55.292, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 1380 + }, + { + "entropy": 0.4725102625787258, + "epoch": 3.37394451145959, + "grad_norm": 0.5451153516769409, + "learning_rate": 0.00019880413756262559, + "loss": 0.42047967910766604, + "mean_token_accuracy": 0.8709015130996705, + "num_tokens": 3938525.0, + "step": 1400 + }, + { + "epoch": 3.37394451145959, + "eval_entropy": 0.4727198945337467, + "eval_loss": 0.5405450463294983, + "eval_mean_token_accuracy": 0.8502951676256201, + "eval_num_tokens": 3938525.0, + "eval_runtime": 55.2551, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 1400 + }, + { + "entropy": 0.46446300894021986, + "epoch": 3.422195416164053, + "grad_norm": 0.5143592357635498, + "learning_rate": 0.00019732402305752366, + "loss": 0.41742353439331054, + "mean_token_accuracy": 0.8726874738931656, + "num_tokens": 3992540.0, + "step": 1420 + }, + { + "epoch": 3.422195416164053, + "eval_entropy": 0.4988140174177256, + "eval_loss": 0.531399130821228, + "eval_mean_token_accuracy": 0.8521023335751523, + "eval_num_tokens": 3992540.0, + "eval_runtime": 55.2501, + "eval_samples_per_second": 25.701, + "eval_steps_per_second": 3.222, + "step": 1420 + }, + { + "entropy": 0.46348607912659645, + "epoch": 3.470446320868516, + "grad_norm": 0.4538668990135193, + "learning_rate": 0.00019582162378251983, + "loss": 0.41525859832763673, + "mean_token_accuracy": 0.8723404765129089, + "num_tokens": 4046628.0, + "step": 1440 + }, + { + "epoch": 3.470446320868516, + "eval_entropy": 0.49713132438364993, + "eval_loss": 0.5303418040275574, + "eval_mean_token_accuracy": 0.8517764990919092, + "eval_num_tokens": 4046628.0, + "eval_runtime": 55.2627, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 1440 + }, + { + "entropy": 0.4754491910338402, + "epoch": 3.5186972255729794, + "grad_norm": 0.5917999744415283, + "learning_rate": 0.00019429736489883723, + "loss": 0.42454705238342283, + "mean_token_accuracy": 0.8697500795125961, + "num_tokens": 4102962.0, + "step": 1460 + }, + { + "epoch": 3.5186972255729794, + "eval_entropy": 0.49202995219927154, + "eval_loss": 0.5311718583106995, + "eval_mean_token_accuracy": 0.851506720767932, + "eval_num_tokens": 4102962.0, + "eval_runtime": 55.2726, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 1460 + }, + { + "entropy": 0.48378537222743034, + "epoch": 3.5669481302774426, + "grad_norm": 0.5555779933929443, + "learning_rate": 0.00019275167775370967, + "loss": 0.4371222496032715, + "mean_token_accuracy": 0.8689292743802071, + "num_tokens": 4157529.0, + "step": 1480 + }, + { + "epoch": 3.5669481302774426, + "eval_entropy": 0.48221782820948056, + "eval_loss": 0.5330429077148438, + "eval_mean_token_accuracy": 0.8519372578417317, + "eval_num_tokens": 4157529.0, + "eval_runtime": 55.2868, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 1480 + }, + { + "entropy": 0.4857771031558514, + "epoch": 3.615199034981906, + "grad_norm": 0.4876253604888916, + "learning_rate": 0.00019118499975831547, + "loss": 0.43259029388427733, + "mean_token_accuracy": 0.8682043462991714, + "num_tokens": 4211781.0, + "step": 1500 + }, + { + "epoch": 3.615199034981906, + "eval_entropy": 0.4850948933470115, + "eval_loss": 0.5299601554870605, + "eval_mean_token_accuracy": 0.8528794411862834, + "eval_num_tokens": 4211781.0, + "eval_runtime": 55.2698, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 1500 + }, + { + "entropy": 0.4771463945508003, + "epoch": 3.663449939686369, + "grad_norm": 0.6399746537208557, + "learning_rate": 0.0001895977742639954, + "loss": 0.4305243968963623, + "mean_token_accuracy": 0.8694027632474899, + "num_tokens": 4263964.0, + "step": 1520 + }, + { + "epoch": 3.663449939686369, + "eval_entropy": 0.5143825498859534, + "eval_loss": 0.529093325138092, + "eval_mean_token_accuracy": 0.8517658697085434, + "eval_num_tokens": 4263964.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1520 + }, + { + "entropy": 0.474514215439558, + "epoch": 3.7117008443908324, + "grad_norm": 0.3690730631351471, + "learning_rate": 0.0001879904504367892, + "loss": 0.42963576316833496, + "mean_token_accuracy": 0.8720991492271424, + "num_tokens": 4321418.0, + "step": 1540 + }, + { + "epoch": 3.7117008443908324, + "eval_entropy": 0.5014224137817875, + "eval_loss": 0.5249797105789185, + "eval_mean_token_accuracy": 0.8529440539606502, + "eval_num_tokens": 4321418.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1540 + }, + { + "entropy": 0.47753885164856913, + "epoch": 3.7599517490952956, + "grad_norm": 0.4569166302680969, + "learning_rate": 0.0001863634831303272, + "loss": 0.4295980453491211, + "mean_token_accuracy": 0.8683714866638184, + "num_tokens": 4376266.0, + "step": 1560 + }, + { + "epoch": 3.7599517490952956, + "eval_entropy": 0.5019658906071374, + "eval_loss": 0.5246453285217285, + "eval_mean_token_accuracy": 0.8528274043222491, + "eval_num_tokens": 4376266.0, + "eval_runtime": 55.2623, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 1560 + }, + { + "entropy": 0.4871028944849968, + "epoch": 3.808202653799759, + "grad_norm": 0.5299158692359924, + "learning_rate": 0.00018471733275711197, + "loss": 0.43556942939758303, + "mean_token_accuracy": 0.8667704507708549, + "num_tokens": 4431183.0, + "step": 1580 + }, + { + "epoch": 3.808202653799759, + "eval_entropy": 0.49056559499729885, + "eval_loss": 0.5267402529716492, + "eval_mean_token_accuracy": 0.8529468617412481, + "eval_num_tokens": 4431183.0, + "eval_runtime": 55.2425, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 1580 + }, + { + "entropy": 0.48621814996004104, + "epoch": 3.856453558504222, + "grad_norm": 0.5417211055755615, + "learning_rate": 0.00018305246515822705, + "loss": 0.4356864929199219, + "mean_token_accuracy": 0.866399897634983, + "num_tokens": 4487602.0, + "step": 1600 + }, + { + "epoch": 3.856453558504222, + "eval_entropy": 0.4862406144985992, + "eval_loss": 0.528841495513916, + "eval_mean_token_accuracy": 0.8527089732416561, + "eval_num_tokens": 4487602.0, + "eval_runtime": 55.2578, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 1600 + }, + { + "entropy": 0.4812369517982006, + "epoch": 3.9047044632086854, + "grad_norm": 0.4729803204536438, + "learning_rate": 0.00018136935147150939, + "loss": 0.4373537540435791, + "mean_token_accuracy": 0.8668754518032074, + "num_tokens": 4544204.0, + "step": 1620 + }, + { + "epoch": 3.9047044632086854, + "eval_entropy": 0.49240104602963736, + "eval_loss": 0.5234901309013367, + "eval_mean_token_accuracy": 0.8531327271059658, + "eval_num_tokens": 4544204.0, + "eval_runtime": 55.2738, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1620 + }, + { + "entropy": 0.46881823167204856, + "epoch": 3.952955367913148, + "grad_norm": 0.6060121059417725, + "learning_rate": 0.00017966846799822304, + "loss": 0.4178919792175293, + "mean_token_accuracy": 0.873378013074398, + "num_tokens": 4601329.0, + "step": 1640 + }, + { + "epoch": 3.952955367913148, + "eval_entropy": 0.49871147733725857, + "eval_loss": 0.5233765244483948, + "eval_mean_token_accuracy": 0.8536087632848975, + "eval_num_tokens": 4601329.0, + "eval_runtime": 55.2741, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 1640 + }, + { + "entropy": 0.48498370402898544, + "epoch": 4.0, + "grad_norm": 2.052300214767456, + "learning_rate": 0.00017795029606827148, + "loss": 0.44376530647277834, + "mean_token_accuracy": 0.8655372567665882, + "num_tokens": 4654752.0, + "step": 1660 + }, + { + "epoch": 4.0, + "eval_entropy": 0.49232733165949916, + "eval_loss": 0.5222684741020203, + "eval_mean_token_accuracy": 0.8533160967773266, + "eval_num_tokens": 4654752.0, + "eval_runtime": 55.2756, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1660 + }, + { + "entropy": 0.4217598669230938, + "epoch": 4.048250904704463, + "grad_norm": 0.38481971621513367, + "learning_rate": 0.00017621532190398683, + "loss": 0.35839712619781494, + "mean_token_accuracy": 0.886622816324234, + "num_tokens": 4710934.0, + "step": 1680 + }, + { + "epoch": 4.048250904704463, + "eval_entropy": 0.4602571583530876, + "eval_loss": 0.5400357246398926, + "eval_mean_token_accuracy": 0.8520095864038789, + "eval_num_tokens": 4710934.0, + "eval_runtime": 55.2755, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 1680 + }, + { + "entropy": 0.415022599697113, + "epoch": 4.0965018094089265, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00017446403648253478, + "loss": 0.36227600574493407, + "mean_token_accuracy": 0.8860071450471878, + "num_tokens": 4768377.0, + "step": 1700 + }, + { + "epoch": 4.0965018094089265, + "eval_entropy": 0.45031814829687056, + "eval_loss": 0.5422973036766052, + "eval_mean_token_accuracy": 0.852367355917277, + "eval_num_tokens": 4768377.0, + "eval_runtime": 55.2798, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1700 + }, + { + "entropy": 0.40994974672794343, + "epoch": 4.144752714113389, + "grad_norm": 0.5344891548156738, + "learning_rate": 0.00017269693539697395, + "loss": 0.36210730075836184, + "mean_token_accuracy": 0.8847656399011612, + "num_tokens": 4826422.0, + "step": 1720 + }, + { + "epoch": 4.144752714113389, + "eval_entropy": 0.4408789911631788, + "eval_loss": 0.54451584815979, + "eval_mean_token_accuracy": 0.8525006958607877, + "eval_num_tokens": 4826422.0, + "eval_runtime": 55.2803, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 1720 + }, + { + "entropy": 0.4054971173405647, + "epoch": 4.193003618817853, + "grad_norm": 0.49729013442993164, + "learning_rate": 0.00017091451871600871, + "loss": 0.3581687927246094, + "mean_token_accuracy": 0.8884050786495209, + "num_tokens": 4883742.0, + "step": 1740 + }, + { + "epoch": 4.193003618817853, + "eval_entropy": 0.4421394058827604, + "eval_loss": 0.5438067317008972, + "eval_mean_token_accuracy": 0.8522373243664088, + "eval_num_tokens": 4883742.0, + "eval_runtime": 55.3273, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 1740 + }, + { + "entropy": 0.408981966227293, + "epoch": 4.241254523522316, + "grad_norm": 0.5137715339660645, + "learning_rate": 0.00016911729084247588, + "loss": 0.35566527843475343, + "mean_token_accuracy": 0.8870424538850784, + "num_tokens": 4939047.0, + "step": 1760 + }, + { + "epoch": 4.241254523522316, + "eval_entropy": 0.437940045856358, + "eval_loss": 0.5455656051635742, + "eval_mean_token_accuracy": 0.8521432715855287, + "eval_num_tokens": 4939047.0, + "eval_runtime": 55.3505, + "eval_samples_per_second": 25.655, + "eval_steps_per_second": 3.216, + "step": 1760 + }, + { + "entropy": 0.41717352643609046, + "epoch": 4.2895054282267795, + "grad_norm": 0.6072068214416504, + "learning_rate": 0.00016730576037060445, + "loss": 0.3726978778839111, + "mean_token_accuracy": 0.8857960850000381, + "num_tokens": 4995084.0, + "step": 1780 + }, + { + "epoch": 4.2895054282267795, + "eval_entropy": 0.4539307618743918, + "eval_loss": 0.5401874780654907, + "eval_mean_token_accuracy": 0.8514684432008294, + "eval_num_tokens": 4995084.0, + "eval_runtime": 55.3542, + "eval_samples_per_second": 25.653, + "eval_steps_per_second": 3.216, + "step": 1780 + }, + { + "entropy": 0.4127464734017849, + "epoch": 4.337756332931242, + "grad_norm": 0.5430658459663391, + "learning_rate": 0.00016548043994208964, + "loss": 0.3644162654876709, + "mean_token_accuracy": 0.8823994249105453, + "num_tokens": 5053068.0, + "step": 1800 + }, + { + "epoch": 4.337756332931242, + "eval_entropy": 0.44713982069090513, + "eval_loss": 0.5412749648094177, + "eval_mean_token_accuracy": 0.852779678414377, + "eval_num_tokens": 5053068.0, + "eval_runtime": 55.3611, + "eval_samples_per_second": 25.65, + "eval_steps_per_second": 3.215, + "step": 1800 + }, + { + "entropy": 0.41877189204096793, + "epoch": 4.386007237635706, + "grad_norm": 0.5927475690841675, + "learning_rate": 0.0001636418461010213, + "loss": 0.3683622360229492, + "mean_token_accuracy": 0.8833754420280456, + "num_tokens": 5104761.0, + "step": 1820 + }, + { + "epoch": 4.386007237635706, + "eval_entropy": 0.46290029468161337, + "eval_loss": 0.5373201966285706, + "eval_mean_token_accuracy": 0.8519754731253292, + "eval_num_tokens": 5104761.0, + "eval_runtime": 55.366, + "eval_samples_per_second": 25.647, + "eval_steps_per_second": 3.215, + "step": 1820 + }, + { + "entropy": 0.41361497789621354, + "epoch": 4.434258142340169, + "grad_norm": 0.42524710297584534, + "learning_rate": 0.0001617904991477079, + "loss": 0.36388933658599854, + "mean_token_accuracy": 0.8857789531350135, + "num_tokens": 5160976.0, + "step": 1840 + }, + { + "epoch": 4.434258142340169, + "eval_entropy": 0.454606260811345, + "eval_loss": 0.5357740521430969, + "eval_mean_token_accuracy": 0.8534008217661568, + "eval_num_tokens": 5160976.0, + "eval_runtime": 55.3257, + "eval_samples_per_second": 25.666, + "eval_steps_per_second": 3.217, + "step": 1840 + }, + { + "entropy": 0.4072607338428497, + "epoch": 4.4825090470446325, + "grad_norm": 0.5685698390007019, + "learning_rate": 0.00015992692299143796, + "loss": 0.36304988861083987, + "mean_token_accuracy": 0.8849645286798478, + "num_tokens": 5220453.0, + "step": 1860 + }, + { + "epoch": 4.4825090470446325, + "eval_entropy": 0.45586093925358206, + "eval_loss": 0.5363942980766296, + "eval_mean_token_accuracy": 0.8529877264178201, + "eval_num_tokens": 5220453.0, + "eval_runtime": 55.3204, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 1860 + }, + { + "entropy": 0.41214886382222177, + "epoch": 4.530759951749095, + "grad_norm": 0.5072870254516602, + "learning_rate": 0.00015805164500221977, + "loss": 0.3678156852722168, + "mean_token_accuracy": 0.8866965472698212, + "num_tokens": 5276552.0, + "step": 1880 + }, + { + "epoch": 4.530759951749095, + "eval_entropy": 0.45211532125982007, + "eval_loss": 0.5386014580726624, + "eval_mean_token_accuracy": 0.8531195494566071, + "eval_num_tokens": 5276552.0, + "eval_runtime": 55.3385, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1880 + }, + { + "entropy": 0.4329418152570724, + "epoch": 4.579010856453559, + "grad_norm": 0.5007720589637756, + "learning_rate": 0.00015616519586154177, + "loss": 0.38508875370025636, + "mean_token_accuracy": 0.880007627606392, + "num_tokens": 5329807.0, + "step": 1900 + }, + { + "epoch": 4.579010856453559, + "eval_entropy": 0.4555856212136451, + "eval_loss": 0.5313775539398193, + "eval_mean_token_accuracy": 0.8542753206879905, + "eval_num_tokens": 5329807.0, + "eval_runtime": 55.3792, + "eval_samples_per_second": 25.641, + "eval_steps_per_second": 3.214, + "step": 1900 + }, + { + "entropy": 0.40598013177514075, + "epoch": 4.627261761158022, + "grad_norm": 0.561281144618988, + "learning_rate": 0.00015426810941219628, + "loss": 0.35770084857940676, + "mean_token_accuracy": 0.8866653427481651, + "num_tokens": 5387697.0, + "step": 1920 + }, + { + "epoch": 4.627261761158022, + "eval_entropy": 0.4488721307408944, + "eval_loss": 0.5346855521202087, + "eval_mean_token_accuracy": 0.8540940257940399, + "eval_num_tokens": 5387697.0, + "eval_runtime": 55.342, + "eval_samples_per_second": 25.659, + "eval_steps_per_second": 3.216, + "step": 1920 + }, + { + "entropy": 0.42046323865652085, + "epoch": 4.675512665862485, + "grad_norm": 0.532781183719635, + "learning_rate": 0.0001523609225072081, + "loss": 0.3753895044326782, + "mean_token_accuracy": 0.8846079766750335, + "num_tokens": 5442617.0, + "step": 1940 + }, + { + "epoch": 4.675512665862485, + "eval_entropy": 0.4516103354732642, + "eval_loss": 0.5331831574440002, + "eval_mean_token_accuracy": 0.8537958291139496, + "eval_num_tokens": 5442617.0, + "eval_runtime": 55.3381, + "eval_samples_per_second": 25.66, + "eval_steps_per_second": 3.217, + "step": 1940 + }, + { + "entropy": 0.4248813711106777, + "epoch": 4.723763570566948, + "grad_norm": 0.573076069355011, + "learning_rate": 0.0001504441748579115, + "loss": 0.37468433380126953, + "mean_token_accuracy": 0.8810012340545654, + "num_tokens": 5498376.0, + "step": 1960 + }, + { + "epoch": 4.723763570566948, + "eval_entropy": 0.45351154650195263, + "eval_loss": 0.5313496589660645, + "eval_mean_token_accuracy": 0.8540389561251308, + "eval_num_tokens": 5498376.0, + "eval_runtime": 55.314, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 1960 + }, + { + "entropy": 0.42766154184937477, + "epoch": 4.772014475271411, + "grad_norm": 0.6360363960266113, + "learning_rate": 0.0001485184088812183, + "loss": 0.3759138584136963, + "mean_token_accuracy": 0.8832121655344963, + "num_tokens": 5550617.0, + "step": 1980 + }, + { + "epoch": 4.772014475271411, + "eval_entropy": 0.45847221890862067, + "eval_loss": 0.5307087302207947, + "eval_mean_token_accuracy": 0.8554085097955854, + "eval_num_tokens": 5550617.0, + "eval_runtime": 55.2786, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 1980 + }, + { + "entropy": 0.43023054748773576, + "epoch": 4.820265379975875, + "grad_norm": 0.5118623971939087, + "learning_rate": 0.00014658416954612026, + "loss": 0.3791791915893555, + "mean_token_accuracy": 0.8808756858110428, + "num_tokens": 5606610.0, + "step": 2000 + }, + { + "epoch": 4.820265379975875, + "eval_entropy": 0.44867819112338375, + "eval_loss": 0.5335640907287598, + "eval_mean_token_accuracy": 0.8539478557833126, + "eval_num_tokens": 5606610.0, + "eval_runtime": 55.2773, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2000 + }, + { + "entropy": 0.4132154494524002, + "epoch": 4.868516284680338, + "grad_norm": 0.507412850856781, + "learning_rate": 0.00014464200421946937, + "loss": 0.3699758768081665, + "mean_token_accuracy": 0.8830380782485008, + "num_tokens": 5665904.0, + "step": 2020 + }, + { + "epoch": 4.868516284680338, + "eval_entropy": 0.447213868579168, + "eval_loss": 0.5322631597518921, + "eval_mean_token_accuracy": 0.8543167797367225, + "eval_num_tokens": 5665904.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2020 + }, + { + "entropy": 0.42996141090989115, + "epoch": 4.916767189384801, + "grad_norm": 0.5200228095054626, + "learning_rate": 0.00014269246251107944, + "loss": 0.3860961675643921, + "mean_token_accuracy": 0.8778340086340904, + "num_tokens": 5719363.0, + "step": 2040 + }, + { + "epoch": 4.916767189384801, + "eval_entropy": 0.4490728845422188, + "eval_loss": 0.5343455076217651, + "eval_mean_token_accuracy": 0.854227306132906, + "eval_num_tokens": 5719363.0, + "eval_runtime": 55.2678, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2040 + }, + { + "entropy": 0.42904197499156, + "epoch": 4.965018094089264, + "grad_norm": 0.679582417011261, + "learning_rate": 0.0001407360961181932, + "loss": 0.38086695671081544, + "mean_token_accuracy": 0.8815151125192642, + "num_tokens": 5772021.0, + "step": 2060 + }, + { + "epoch": 4.965018094089264, + "eval_entropy": 0.45351302489805756, + "eval_loss": 0.5304082036018372, + "eval_mean_token_accuracy": 0.8544826795545857, + "eval_num_tokens": 5772021.0, + "eval_runtime": 55.2887, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2060 + }, + { + "entropy": 0.3950294531308688, + "epoch": 5.012062726176116, + "grad_norm": 0.4820767641067505, + "learning_rate": 0.00013877345866935813, + "loss": 0.34286065101623536, + "mean_token_accuracy": 0.891666068480565, + "num_tokens": 5831937.0, + "step": 2080 + }, + { + "epoch": 5.012062726176116, + "eval_entropy": 0.40521935934431097, + "eval_loss": 0.5571053624153137, + "eval_mean_token_accuracy": 0.8532105804829115, + "eval_num_tokens": 5831937.0, + "eval_runtime": 55.2721, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2080 + }, + { + "entropy": 0.3518491223454475, + "epoch": 5.060313630880579, + "grad_norm": 0.5651171803474426, + "learning_rate": 0.00013680510556775657, + "loss": 0.2968994855880737, + "mean_token_accuracy": 0.9036217927932739, + "num_tokens": 5885628.0, + "step": 2100 + }, + { + "epoch": 5.060313630880579, + "eval_entropy": 0.3908112795835131, + "eval_loss": 0.5718717575073242, + "eval_mean_token_accuracy": 0.8517157733440399, + "eval_num_tokens": 5885628.0, + "eval_runtime": 55.3121, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2100 + }, + { + "entropy": 0.3508297473192215, + "epoch": 5.108564535585042, + "grad_norm": 0.7483569979667664, + "learning_rate": 0.00013483159383403286, + "loss": 0.2943051815032959, + "mean_token_accuracy": 0.9050277039408684, + "num_tokens": 5938273.0, + "step": 2120 + }, + { + "epoch": 5.108564535585042, + "eval_entropy": 0.3922154439634152, + "eval_loss": 0.5800208449363708, + "eval_mean_token_accuracy": 0.849590765626243, + "eval_num_tokens": 5938273.0, + "eval_runtime": 55.3062, + "eval_samples_per_second": 25.675, + "eval_steps_per_second": 3.218, + "step": 2120 + }, + { + "entropy": 0.3514078348875046, + "epoch": 5.156815440289505, + "grad_norm": 0.5905992388725281, + "learning_rate": 0.00013285348194866324, + "loss": 0.30216853618621825, + "mean_token_accuracy": 0.9038319244980813, + "num_tokens": 5994067.0, + "step": 2140 + }, + { + "epoch": 5.156815440289505, + "eval_entropy": 0.40027610237678785, + "eval_loss": 0.5710272789001465, + "eval_mean_token_accuracy": 0.8524003802390581, + "eval_num_tokens": 5994067.0, + "eval_runtime": 55.3115, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 2140 + }, + { + "entropy": 0.36622492372989657, + "epoch": 5.205066344993969, + "grad_norm": 0.4692232012748718, + "learning_rate": 0.00013087132969391246, + "loss": 0.307849645614624, + "mean_token_accuracy": 0.9014044284820557, + "num_tokens": 6047358.0, + "step": 2160 + }, + { + "epoch": 5.205066344993969, + "eval_entropy": 0.4034242611587717, + "eval_loss": 0.5697806477546692, + "eval_mean_token_accuracy": 0.8520179243569963, + "eval_num_tokens": 6047358.0, + "eval_runtime": 55.2722, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 2160 + }, + { + "entropy": 0.3508648067712784, + "epoch": 5.253317249698432, + "grad_norm": 0.7615344524383545, + "learning_rate": 0.0001288856979954221, + "loss": 0.29553372859954835, + "mean_token_accuracy": 0.9034190520644187, + "num_tokens": 6106644.0, + "step": 2180 + }, + { + "epoch": 5.253317249698432, + "eval_entropy": 0.3921829242719693, + "eval_loss": 0.5691862106323242, + "eval_mean_token_accuracy": 0.8526006635655178, + "eval_num_tokens": 6106644.0, + "eval_runtime": 55.3002, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 2180 + }, + { + "entropy": 0.3519009813666344, + "epoch": 5.301568154402895, + "grad_norm": 0.7679558992385864, + "learning_rate": 0.00012689714876347493, + "loss": 0.29419128894805907, + "mean_token_accuracy": 0.9031866028904915, + "num_tokens": 6160311.0, + "step": 2200 + }, + { + "epoch": 5.301568154402895, + "eval_entropy": 0.39013911599523565, + "eval_loss": 0.5767874121665955, + "eval_mean_token_accuracy": 0.8514098655641749, + "eval_num_tokens": 6160311.0, + "eval_runtime": 55.2892, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2200 + }, + { + "entropy": 0.34301606491208075, + "epoch": 5.349819059107358, + "grad_norm": 0.6707648634910583, + "learning_rate": 0.0001249062447339814, + "loss": 0.29520268440246583, + "mean_token_accuracy": 0.903484332561493, + "num_tokens": 6218747.0, + "step": 2220 + }, + { + "epoch": 5.349819059107358, + "eval_entropy": 0.3892528228880314, + "eval_loss": 0.5764396786689758, + "eval_mean_token_accuracy": 0.8511234943786364, + "eval_num_tokens": 6218747.0, + "eval_runtime": 55.2573, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2220 + }, + { + "entropy": 0.35039012134075165, + "epoch": 5.398069963811821, + "grad_norm": 0.64646315574646, + "learning_rate": 0.00012291354930923175, + "loss": 0.3015883207321167, + "mean_token_accuracy": 0.9020207405090332, + "num_tokens": 6274591.0, + "step": 2240 + }, + { + "epoch": 5.398069963811821, + "eval_entropy": 0.39754635162567825, + "eval_loss": 0.5652341842651367, + "eval_mean_token_accuracy": 0.8526410135660278, + "eval_num_tokens": 6274591.0, + "eval_runtime": 55.2883, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.219, + "step": 2240 + }, + { + "entropy": 0.3538553349673748, + "epoch": 5.446320868516285, + "grad_norm": 0.760821521282196, + "learning_rate": 0.00012091962639845982, + "loss": 0.303299617767334, + "mean_token_accuracy": 0.9028412505984307, + "num_tokens": 6328661.0, + "step": 2260 + }, + { + "epoch": 5.446320868516285, + "eval_entropy": 0.39631325575742826, + "eval_loss": 0.5641883611679077, + "eval_mean_token_accuracy": 0.8536281538813302, + "eval_num_tokens": 6328661.0, + "eval_runtime": 55.3024, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 2260 + }, + { + "entropy": 0.3555382125079632, + "epoch": 5.4945717732207475, + "grad_norm": 0.6750470399856567, + "learning_rate": 0.00011892504025826358, + "loss": 0.30209062099456785, + "mean_token_accuracy": 0.9013994172215462, + "num_tokens": 6383970.0, + "step": 2280 + }, + { + "epoch": 5.4945717732207475, + "eval_entropy": 0.39563166810555406, + "eval_loss": 0.5636888146400452, + "eval_mean_token_accuracy": 0.8531888388515858, + "eval_num_tokens": 6383970.0, + "eval_runtime": 55.2944, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2280 + }, + { + "entropy": 0.3454573631286621, + "epoch": 5.542822677925211, + "grad_norm": 0.7239598035812378, + "learning_rate": 0.00011693035533292696, + "loss": 0.29387383460998534, + "mean_token_accuracy": 0.9050945967435837, + "num_tokens": 6442499.0, + "step": 2300 + }, + { + "epoch": 5.542822677925211, + "eval_entropy": 0.392980629306161, + "eval_loss": 0.5626416802406311, + "eval_mean_token_accuracy": 0.8542174549584978, + "eval_num_tokens": 6442499.0, + "eval_runtime": 55.2624, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 2300 + }, + { + "entropy": 0.3474574498832226, + "epoch": 5.591073582629674, + "grad_norm": 0.6282244324684143, + "learning_rate": 0.00011493613609468904, + "loss": 0.3063398599624634, + "mean_token_accuracy": 0.9010455697774887, + "num_tokens": 6503184.0, + "step": 2320 + }, + { + "epoch": 5.591073582629674, + "eval_entropy": 0.4008133021298419, + "eval_loss": 0.5614317655563354, + "eval_mean_token_accuracy": 0.8536901155884347, + "eval_num_tokens": 6503184.0, + "eval_runtime": 55.289, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2320 + }, + { + "entropy": 0.3517454981803894, + "epoch": 5.639324487334138, + "grad_norm": 0.5463727116584778, + "learning_rate": 0.00011294294688400486, + "loss": 0.30131995677948, + "mean_token_accuracy": 0.9020274326205253, + "num_tokens": 6562777.0, + "step": 2340 + }, + { + "epoch": 5.639324487334138, + "eval_entropy": 0.4002562404683467, + "eval_loss": 0.5606418251991272, + "eval_mean_token_accuracy": 0.8533886798312155, + "eval_num_tokens": 6562777.0, + "eval_runtime": 55.2831, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 2340 + }, + { + "entropy": 0.35189504325389864, + "epoch": 5.6875753920386005, + "grad_norm": 0.6368454098701477, + "learning_rate": 0.00011095135174984394, + "loss": 0.3063028812408447, + "mean_token_accuracy": 0.902279743552208, + "num_tokens": 6622512.0, + "step": 2360 + }, + { + "epoch": 5.6875753920386005, + "eval_entropy": 0.4084411474426141, + "eval_loss": 0.55985027551651, + "eval_mean_token_accuracy": 0.8527230613687066, + "eval_num_tokens": 6622512.0, + "eval_runtime": 55.2785, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2360 + }, + { + "entropy": 0.35596260875463487, + "epoch": 5.735826296743064, + "grad_norm": 0.5473037362098694, + "learning_rate": 0.00010896191429007085, + "loss": 0.30281500816345214, + "mean_token_accuracy": 0.9030452728271484, + "num_tokens": 6676643.0, + "step": 2380 + }, + { + "epoch": 5.735826296743064, + "eval_entropy": 0.39605340093709107, + "eval_loss": 0.5619694590568542, + "eval_mean_token_accuracy": 0.8537970248902782, + "eval_num_tokens": 6676643.0, + "eval_runtime": 55.2458, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2380 + }, + { + "entropy": 0.3503343403339386, + "epoch": 5.784077201447527, + "grad_norm": 0.5703344941139221, + "learning_rate": 0.00010697519749195404, + "loss": 0.30224013328552246, + "mean_token_accuracy": 0.901669493317604, + "num_tokens": 6733574.0, + "step": 2400 + }, + { + "epoch": 5.784077201447527, + "eval_entropy": 0.40314525566744003, + "eval_loss": 0.5587471723556519, + "eval_mean_token_accuracy": 0.8520790084024493, + "eval_num_tokens": 6733574.0, + "eval_runtime": 55.2455, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 2400 + }, + { + "entropy": 0.34604763686656953, + "epoch": 5.832328106151991, + "grad_norm": 0.6742275953292847, + "learning_rate": 0.00010499176357284669, + "loss": 0.30106277465820314, + "mean_token_accuracy": 0.9020058959722519, + "num_tokens": 6790549.0, + "step": 2420 + }, + { + "epoch": 5.832328106151991, + "eval_entropy": 0.3899915837839748, + "eval_loss": 0.5612479448318481, + "eval_mean_token_accuracy": 0.8548604069131144, + "eval_num_tokens": 6790549.0, + "eval_runtime": 55.3052, + "eval_samples_per_second": 25.676, + "eval_steps_per_second": 3.219, + "step": 2420 + }, + { + "entropy": 0.3510720990598202, + "epoch": 5.8805790108564535, + "grad_norm": 0.6276938319206238, + "learning_rate": 0.00010301217382108624, + "loss": 0.3025418043136597, + "mean_token_accuracy": 0.9027798771858215, + "num_tokens": 6845043.0, + "step": 2440 + }, + { + "epoch": 5.8805790108564535, + "eval_entropy": 0.3858113387662373, + "eval_loss": 0.5640541315078735, + "eval_mean_token_accuracy": 0.8541722240742673, + "eval_num_tokens": 6845043.0, + "eval_runtime": 55.329, + "eval_samples_per_second": 25.665, + "eval_steps_per_second": 3.217, + "step": 2440 + }, + { + "entropy": 0.3422310143709183, + "epoch": 5.928829915560916, + "grad_norm": 0.6075900197029114, + "learning_rate": 0.00010103698843715608, + "loss": 0.2961073160171509, + "mean_token_accuracy": 0.9035829156637192, + "num_tokens": 6899831.0, + "step": 2460 + }, + { + "epoch": 5.928829915560916, + "eval_entropy": 0.3931033756960644, + "eval_loss": 0.5563910603523254, + "eval_mean_token_accuracy": 0.8541168610701401, + "eval_num_tokens": 6899831.0, + "eval_runtime": 55.3187, + "eval_samples_per_second": 25.669, + "eval_steps_per_second": 3.218, + "step": 2460 + }, + { + "entropy": 0.34767043516039847, + "epoch": 5.97708082026538, + "grad_norm": 0.7821509838104248, + "learning_rate": 9.906676637515565e-05, + "loss": 0.2965876579284668, + "mean_token_accuracy": 0.9043371796607971, + "num_tokens": 6956048.0, + "step": 2480 + }, + { + "epoch": 5.97708082026538, + "eval_entropy": 0.3876880623316497, + "eval_loss": 0.5590454936027527, + "eval_mean_token_accuracy": 0.8546800589963292, + "eval_num_tokens": 6956048.0, + "eval_runtime": 55.3153, + "eval_samples_per_second": 25.671, + "eval_steps_per_second": 3.218, + "step": 2480 + }, + { + "entropy": 0.30925769530809843, + "epoch": 6.024125452352232, + "grad_norm": 0.8434519171714783, + "learning_rate": 9.71020651846231e-05, + "loss": 0.2541666507720947, + "mean_token_accuracy": 0.9174298537083161, + "num_tokens": 7014287.0, + "step": 2500 + }, + { + "epoch": 6.024125452352232, + "eval_entropy": 0.33969055735663084, + "eval_loss": 0.6305586099624634, + "eval_mean_token_accuracy": 0.8488262406225955, + "eval_num_tokens": 7014287.0, + "eval_runtime": 55.299, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 2500 + }, + { + "entropy": 0.27741119749844073, + "epoch": 6.072376357056695, + "grad_norm": 0.7920182943344116, + "learning_rate": 9.514344085275508e-05, + "loss": 0.21898913383483887, + "mean_token_accuracy": 0.9291795000433922, + "num_tokens": 7067527.0, + "step": 2520 + }, + { + "epoch": 6.072376357056695, + "eval_entropy": 0.3445688337087631, + "eval_loss": 0.613576352596283, + "eval_mean_token_accuracy": 0.8510002580921302, + "eval_num_tokens": 7067527.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2520 + }, + { + "entropy": 0.2773283515125513, + "epoch": 6.120627261761158, + "grad_norm": 0.7460144758224487, + "learning_rate": 9.31914476470693e-05, + "loss": 0.2153709650039673, + "mean_token_accuracy": 0.929512245953083, + "num_tokens": 7120104.0, + "step": 2540 + }, + { + "epoch": 6.120627261761158, + "eval_entropy": 0.3431067659278934, + "eval_loss": 0.6223914623260498, + "eval_mean_token_accuracy": 0.8489464427647966, + "eval_num_tokens": 7120104.0, + "eval_runtime": 55.3127, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 2540 + }, + { + "entropy": 0.28125015236437323, + "epoch": 6.168878166465621, + "grad_norm": 0.6596947908401489, + "learning_rate": 9.124663795855309e-05, + "loss": 0.22263822555541993, + "mean_token_accuracy": 0.9263416901230812, + "num_tokens": 7173775.0, + "step": 2560 + }, + { + "epoch": 6.168878166465621, + "eval_entropy": 0.3495907724908229, + "eval_loss": 0.6132948994636536, + "eval_mean_token_accuracy": 0.8493377057354102, + "eval_num_tokens": 7173775.0, + "eval_runtime": 55.2764, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 2560 + }, + { + "entropy": 0.2749589327722788, + "epoch": 6.217129071170085, + "grad_norm": 0.6811111569404602, + "learning_rate": 8.930956214534336e-05, + "loss": 0.2155080556869507, + "mean_token_accuracy": 0.9284482330083847, + "num_tokens": 7229823.0, + "step": 2580 + }, + { + "epoch": 6.217129071170085, + "eval_entropy": 0.3410603646816832, + "eval_loss": 0.6241295337677002, + "eval_mean_token_accuracy": 0.8500961688127411, + "eval_num_tokens": 7229823.0, + "eval_runtime": 55.2676, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 2580 + }, + { + "entropy": 0.2811116900295019, + "epoch": 6.265379975874548, + "grad_norm": 0.7649447917938232, + "learning_rate": 8.738076837698193e-05, + "loss": 0.2263277769088745, + "mean_token_accuracy": 0.9255106285214424, + "num_tokens": 7287356.0, + "step": 2600 + }, + { + "epoch": 6.265379975874548, + "eval_entropy": 0.34441549171892444, + "eval_loss": 0.6095326542854309, + "eval_mean_token_accuracy": 0.8520389209302623, + "eval_num_tokens": 7287356.0, + "eval_runtime": 55.2635, + "eval_samples_per_second": 25.695, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "entropy": 0.27435422986745833, + "epoch": 6.3136308805790105, + "grad_norm": 0.6525525450706482, + "learning_rate": 8.546080247928975e-05, + "loss": 0.2196337938308716, + "mean_token_accuracy": 0.9278794303536415, + "num_tokens": 7345818.0, + "step": 2620 + }, + { + "epoch": 6.3136308805790105, + "eval_entropy": 0.3514235028055277, + "eval_loss": 0.6111680865287781, + "eval_mean_token_accuracy": 0.8514190537206242, + "eval_num_tokens": 7345818.0, + "eval_runtime": 55.2942, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 2620 + }, + { + "entropy": 0.27057958133518695, + "epoch": 6.361881785283474, + "grad_norm": 0.7869387269020081, + "learning_rate": 8.355020777990492e-05, + "loss": 0.21130192279815674, + "mean_token_accuracy": 0.9296034276485443, + "num_tokens": 7406978.0, + "step": 2640 + }, + { + "epoch": 6.361881785283474, + "eval_entropy": 0.3361843256803041, + "eval_loss": 0.6246429681777954, + "eval_mean_token_accuracy": 0.8500659284966715, + "eval_num_tokens": 7406978.0, + "eval_runtime": 55.2901, + "eval_samples_per_second": 25.683, + "eval_steps_per_second": 3.219, + "step": 2640 + }, + { + "entropy": 0.2786216359585524, + "epoch": 6.410132689987937, + "grad_norm": 0.7876622676849365, + "learning_rate": 8.164952495452717e-05, + "loss": 0.2234494209289551, + "mean_token_accuracy": 0.9263992309570312, + "num_tokens": 7462212.0, + "step": 2660 + }, + { + "epoch": 6.410132689987937, + "eval_entropy": 0.3445214969053697, + "eval_loss": 0.6168352961540222, + "eval_mean_token_accuracy": 0.8508294469185089, + "eval_num_tokens": 7462212.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 2660 + }, + { + "entropy": 0.27971002720296384, + "epoch": 6.458383594692401, + "grad_norm": 0.7404258847236633, + "learning_rate": 7.975929187391317e-05, + "loss": 0.22412145137786865, + "mean_token_accuracy": 0.9254573807120323, + "num_tokens": 7519697.0, + "step": 2680 + }, + { + "epoch": 6.458383594692401, + "eval_entropy": 0.3495463337121385, + "eval_loss": 0.6165894269943237, + "eval_mean_token_accuracy": 0.8501586800210932, + "eval_num_tokens": 7519697.0, + "eval_runtime": 55.2778, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 2680 + }, + { + "entropy": 0.28011396154761314, + "epoch": 6.5066344993968634, + "grad_norm": 0.6830134391784668, + "learning_rate": 7.788004345166545e-05, + "loss": 0.22303051948547364, + "mean_token_accuracy": 0.9269484728574753, + "num_tokens": 7574834.0, + "step": 2700 + }, + { + "epoch": 6.5066344993968634, + "eval_entropy": 0.34271225785271503, + "eval_loss": 0.615679144859314, + "eval_mean_token_accuracy": 0.8512428282351976, + "eval_num_tokens": 7574834.0, + "eval_runtime": 55.2417, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2700 + }, + { + "entropy": 0.27497340776026247, + "epoch": 6.554885404101327, + "grad_norm": 0.6799295544624329, + "learning_rate": 7.601231149285811e-05, + "loss": 0.2222221851348877, + "mean_token_accuracy": 0.9258430942893028, + "num_tokens": 7627577.0, + "step": 2720 + }, + { + "epoch": 6.554885404101327, + "eval_entropy": 0.34204172083501067, + "eval_loss": 0.6145843863487244, + "eval_mean_token_accuracy": 0.851729148559356, + "eval_num_tokens": 7627577.0, + "eval_runtime": 55.254, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.221, + "step": 2720 + }, + { + "entropy": 0.27879350669682024, + "epoch": 6.60313630880579, + "grad_norm": 0.7392159700393677, + "learning_rate": 7.41566245435424e-05, + "loss": 0.22497382164001464, + "mean_token_accuracy": 0.9250957772135735, + "num_tokens": 7682457.0, + "step": 2740 + }, + { + "epoch": 6.60313630880579, + "eval_entropy": 0.34408896290854124, + "eval_loss": 0.6155394315719604, + "eval_mean_token_accuracy": 0.8512654702984885, + "eval_num_tokens": 7682457.0, + "eval_runtime": 55.2601, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2740 + }, + { + "entropy": 0.29418626725673674, + "epoch": 6.651387213510254, + "grad_norm": 1.0366321802139282, + "learning_rate": 7.23135077411743e-05, + "loss": 0.23374652862548828, + "mean_token_accuracy": 0.9242533966898918, + "num_tokens": 7735055.0, + "step": 2760 + }, + { + "epoch": 6.651387213510254, + "eval_entropy": 0.34456085924352153, + "eval_loss": 0.6196476817131042, + "eval_mean_token_accuracy": 0.8504375420259626, + "eval_num_tokens": 7735055.0, + "eval_runtime": 55.2538, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.221, + "step": 2760 + }, + { + "entropy": 0.2746732197701931, + "epoch": 6.699638118214716, + "grad_norm": 0.7725631594657898, + "learning_rate": 7.048348266600684e-05, + "loss": 0.22313270568847657, + "mean_token_accuracy": 0.9278106808662414, + "num_tokens": 7789582.0, + "step": 2780 + }, + { + "epoch": 6.699638118214716, + "eval_entropy": 0.3446769873412807, + "eval_loss": 0.6121717095375061, + "eval_mean_token_accuracy": 0.8509121691243032, + "eval_num_tokens": 7789582.0, + "eval_runtime": 55.2595, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 2780 + }, + { + "entropy": 0.287082564458251, + "epoch": 6.74788902291918, + "grad_norm": 0.6063140630722046, + "learning_rate": 6.866706719348931e-05, + "loss": 0.22704455852508545, + "mean_token_accuracy": 0.9244498163461685, + "num_tokens": 7843628.0, + "step": 2800 + }, + { + "epoch": 6.74788902291918, + "eval_entropy": 0.34377153686593087, + "eval_loss": 0.6114247441291809, + "eval_mean_token_accuracy": 0.8516378858116236, + "eval_num_tokens": 7843628.0, + "eval_runtime": 55.2449, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 2800 + }, + { + "entropy": 0.2760728094726801, + "epoch": 6.796139927623643, + "grad_norm": 0.6179378628730774, + "learning_rate": 6.686477534771416e-05, + "loss": 0.22595617771148682, + "mean_token_accuracy": 0.9256764411926269, + "num_tokens": 7902270.0, + "step": 2820 + }, + { + "epoch": 6.796139927623643, + "eval_entropy": 0.342443875047598, + "eval_loss": 0.6084980964660645, + "eval_mean_token_accuracy": 0.8524504812915673, + "eval_num_tokens": 7902270.0, + "eval_runtime": 55.2286, + "eval_samples_per_second": 25.711, + "eval_steps_per_second": 3.223, + "step": 2820 + }, + { + "entropy": 0.27494382336735723, + "epoch": 6.844390832328106, + "grad_norm": 0.781110942363739, + "learning_rate": 6.507711715595483e-05, + "loss": 0.22353668212890626, + "mean_token_accuracy": 0.9271390274167061, + "num_tokens": 7958644.0, + "step": 2840 + }, + { + "epoch": 6.844390832328106, + "eval_entropy": 0.33768313754810375, + "eval_loss": 0.614266574382782, + "eval_mean_token_accuracy": 0.851384595538793, + "eval_num_tokens": 7958644.0, + "eval_runtime": 55.2351, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 2840 + }, + { + "entropy": 0.27367788292467593, + "epoch": 6.892641737032569, + "grad_norm": 0.836021363735199, + "learning_rate": 6.330459850433355e-05, + "loss": 0.222139310836792, + "mean_token_accuracy": 0.9269705146551133, + "num_tokens": 8014311.0, + "step": 2860 + }, + { + "epoch": 6.892641737032569, + "eval_entropy": 0.3395471740304754, + "eval_loss": 0.6169298887252808, + "eval_mean_token_accuracy": 0.8521519695105177, + "eval_num_tokens": 8014311.0, + "eval_runtime": 55.2524, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2860 + }, + { + "entropy": 0.27472747303545475, + "epoch": 6.940892641737032, + "grad_norm": 0.6614536643028259, + "learning_rate": 6.154772099466185e-05, + "loss": 0.222674560546875, + "mean_token_accuracy": 0.9267930790781975, + "num_tokens": 8074986.0, + "step": 2880 + }, + { + "epoch": 6.940892641737032, + "eval_entropy": 0.3411581661928906, + "eval_loss": 0.6118640303611755, + "eval_mean_token_accuracy": 0.8519984858759334, + "eval_num_tokens": 8074986.0, + "eval_runtime": 55.2525, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2880 + }, + { + "entropy": 0.275695338845253, + "epoch": 6.989143546441496, + "grad_norm": 0.6573458313941956, + "learning_rate": 5.980698180249315e-05, + "loss": 0.2251124620437622, + "mean_token_accuracy": 0.926266947388649, + "num_tokens": 8132041.0, + "step": 2900 + }, + { + "epoch": 6.989143546441496, + "eval_entropy": 0.34243478517184095, + "eval_loss": 0.6122242212295532, + "eval_mean_token_accuracy": 0.8515615922011687, + "eval_num_tokens": 8132041.0, + "eval_runtime": 55.2521, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 2900 + }, + { + "entropy": 0.24234489026742104, + "epoch": 7.036188178528348, + "grad_norm": 0.8545143604278564, + "learning_rate": 5.808287353642782e-05, + "loss": 0.167067813873291, + "mean_token_accuracy": 0.9460623829792707, + "num_tokens": 8188174.0, + "step": 2920 + }, + { + "epoch": 7.036188178528348, + "eval_entropy": 0.2994119189261051, + "eval_loss": 0.7004832029342651, + "eval_mean_token_accuracy": 0.8472452401445153, + "eval_num_tokens": 8188174.0, + "eval_runtime": 55.2743, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 2920 + }, + { + "entropy": 0.21194725222885608, + "epoch": 7.084439083232811, + "grad_norm": 0.7077902555465698, + "learning_rate": 5.637588409871098e-05, + "loss": 0.14837799072265626, + "mean_token_accuracy": 0.951434426009655, + "num_tokens": 8244016.0, + "step": 2940 + }, + { + "epoch": 7.084439083232811, + "eval_entropy": 0.2990778482864412, + "eval_loss": 0.6822870969772339, + "eval_mean_token_accuracy": 0.8486404402202434, + "eval_num_tokens": 8244016.0, + "eval_runtime": 55.2489, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 2940 + }, + { + "entropy": 0.20270478539168835, + "epoch": 7.132689987937274, + "grad_norm": 0.7954618334770203, + "learning_rate": 5.468649654716176e-05, + "loss": 0.14337145090103148, + "mean_token_accuracy": 0.9537484034895897, + "num_tokens": 8299545.0, + "step": 2960 + }, + { + "epoch": 7.132689987937274, + "eval_entropy": 0.30174569743737745, + "eval_loss": 0.6869224905967712, + "eval_mean_token_accuracy": 0.8480379139439443, + "eval_num_tokens": 8299545.0, + "eval_runtime": 55.2427, + "eval_samples_per_second": 25.705, + "eval_steps_per_second": 3.222, + "step": 2960 + }, + { + "entropy": 0.2108145073056221, + "epoch": 7.180940892641737, + "grad_norm": 0.6713868379592896, + "learning_rate": 5.3015188958473624e-05, + "loss": 0.14596234560012816, + "mean_token_accuracy": 0.9524013876914978, + "num_tokens": 8352460.0, + "step": 2980 + }, + { + "epoch": 7.180940892641737, + "eval_entropy": 0.2993907957766833, + "eval_loss": 0.6884537935256958, + "eval_mean_token_accuracy": 0.8481567679496294, + "eval_num_tokens": 8352460.0, + "eval_runtime": 55.2571, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 2980 + }, + { + "entropy": 0.20720747038722037, + "epoch": 7.2291917973462, + "grad_norm": 0.7463769912719727, + "learning_rate": 5.136243429292464e-05, + "loss": 0.14544438123703002, + "mean_token_accuracy": 0.9533925041556358, + "num_tokens": 8411328.0, + "step": 3000 + }, + { + "epoch": 7.2291917973462, + "eval_entropy": 0.29728540635845635, + "eval_loss": 0.6921409964561462, + "eval_mean_token_accuracy": 0.8480995625592349, + "eval_num_tokens": 8411328.0, + "eval_runtime": 55.2493, + "eval_samples_per_second": 25.702, + "eval_steps_per_second": 3.222, + "step": 3000 + }, + { + "entropy": 0.20696109160780907, + "epoch": 7.277442702050664, + "grad_norm": 0.7309594750404358, + "learning_rate": 4.972870026053484e-05, + "loss": 0.14989933967590333, + "mean_token_accuracy": 0.9511091738939286, + "num_tokens": 8466715.0, + "step": 3020 + }, + { + "epoch": 7.277442702050664, + "eval_entropy": 0.2941792637444614, + "eval_loss": 0.6908664107322693, + "eval_mean_token_accuracy": 0.8489746732658214, + "eval_num_tokens": 8466715.0, + "eval_runtime": 55.2319, + "eval_samples_per_second": 25.71, + "eval_steps_per_second": 3.223, + "step": 3020 + }, + { + "entropy": 0.20037804245948793, + "epoch": 7.325693606755126, + "grad_norm": 0.8592659831047058, + "learning_rate": 4.811444918871029e-05, + "loss": 0.1415112853050232, + "mean_token_accuracy": 0.9539519399404526, + "num_tokens": 8524635.0, + "step": 3040 + }, + { + "epoch": 7.325693606755126, + "eval_entropy": 0.29864797372831386, + "eval_loss": 0.6853435635566711, + "eval_mean_token_accuracy": 0.8484187939863527, + "eval_num_tokens": 8524635.0, + "eval_runtime": 55.2473, + "eval_samples_per_second": 25.703, + "eval_steps_per_second": 3.222, + "step": 3040 + }, + { + "entropy": 0.21042127199470997, + "epoch": 7.37394451145959, + "grad_norm": 0.734545111656189, + "learning_rate": 4.652013789140951e-05, + "loss": 0.15053329467773438, + "mean_token_accuracy": 0.9511837676167488, + "num_tokens": 8580204.0, + "step": 3060 + }, + { + "epoch": 7.37394451145959, + "eval_entropy": 0.2982367055302256, + "eval_loss": 0.6894978284835815, + "eval_mean_token_accuracy": 0.8486884664953425, + "eval_num_tokens": 8580204.0, + "eval_runtime": 55.2784, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 3060 + }, + { + "entropy": 0.210346744582057, + "epoch": 7.422195416164053, + "grad_norm": 0.8108986020088196, + "learning_rate": 4.4946217539870706e-05, + "loss": 0.14928361177444457, + "mean_token_accuracy": 0.9498722046613693, + "num_tokens": 8637335.0, + "step": 3080 + }, + { + "epoch": 7.422195416164053, + "eval_entropy": 0.2913190031821808, + "eval_loss": 0.6952372193336487, + "eval_mean_token_accuracy": 0.8496706505839744, + "eval_num_tokens": 8637335.0, + "eval_runtime": 55.2728, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 3080 + }, + { + "entropy": 0.20841738171875476, + "epoch": 7.470446320868517, + "grad_norm": 0.8051531910896301, + "learning_rate": 4.339313353493576e-05, + "loss": 0.1464880108833313, + "mean_token_accuracy": 0.9526000887155532, + "num_tokens": 8691346.0, + "step": 3100 + }, + { + "epoch": 7.470446320868517, + "eval_entropy": 0.2890907092375702, + "eval_loss": 0.7007566690444946, + "eval_mean_token_accuracy": 0.8492257896434056, + "eval_num_tokens": 8691346.0, + "eval_runtime": 55.2694, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3100 + }, + { + "entropy": 0.19933607392013072, + "epoch": 7.518697225572979, + "grad_norm": 0.7979084849357605, + "learning_rate": 4.186132538100677e-05, + "loss": 0.14407336711883545, + "mean_token_accuracy": 0.9528203010559082, + "num_tokens": 8748823.0, + "step": 3120 + }, + { + "epoch": 7.518697225572979, + "eval_entropy": 0.294091603226876, + "eval_loss": 0.6922751665115356, + "eval_mean_token_accuracy": 0.8485970986023378, + "eval_num_tokens": 8748823.0, + "eval_runtime": 55.2663, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 3120 + }, + { + "entropy": 0.19975723810493945, + "epoch": 7.566948130277443, + "grad_norm": 0.7901250720024109, + "learning_rate": 4.035122656167186e-05, + "loss": 0.14221296310424805, + "mean_token_accuracy": 0.9538118690252304, + "num_tokens": 8808835.0, + "step": 3140 + }, + { + "epoch": 7.566948130277443, + "eval_entropy": 0.2950711101293564, + "eval_loss": 0.690978467464447, + "eval_mean_token_accuracy": 0.8483095872268248, + "eval_num_tokens": 8808835.0, + "eval_runtime": 55.2705, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 3140 + }, + { + "entropy": 0.2083854541182518, + "epoch": 7.615199034981906, + "grad_norm": 0.6864560842514038, + "learning_rate": 3.886326441703407e-05, + "loss": 0.15056604146957397, + "mean_token_accuracy": 0.9521368011832237, + "num_tokens": 8864794.0, + "step": 3160 + }, + { + "epoch": 7.615199034981906, + "eval_entropy": 0.29514283318532986, + "eval_loss": 0.6886241436004639, + "eval_mean_token_accuracy": 0.8497069950184125, + "eval_num_tokens": 8864794.0, + "eval_runtime": 55.2259, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 3160 + }, + { + "entropy": 0.21473116055130959, + "epoch": 7.66344993968637, + "grad_norm": 0.7526962161064148, + "learning_rate": 3.739786002277949e-05, + "loss": 0.14960399866104127, + "mean_token_accuracy": 0.9506274402141571, + "num_tokens": 8919306.0, + "step": 3180 + }, + { + "epoch": 7.66344993968637, + "eval_entropy": 0.29309218067131687, + "eval_loss": 0.6990856528282166, + "eval_mean_token_accuracy": 0.8479215161184247, + "eval_num_tokens": 8919306.0, + "eval_runtime": 55.1684, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 3180 + }, + { + "entropy": 0.20980504602193834, + "epoch": 7.711700844390832, + "grad_norm": 0.8401498198509216, + "learning_rate": 3.5955428071017554e-05, + "loss": 0.14723907709121703, + "mean_token_accuracy": 0.9517867639660835, + "num_tokens": 8973330.0, + "step": 3200 + }, + { + "epoch": 7.711700844390832, + "eval_entropy": 0.2953076950284872, + "eval_loss": 0.6917204260826111, + "eval_mean_token_accuracy": 0.8481054918819599, + "eval_num_tokens": 8973330.0, + "eval_runtime": 55.1731, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3200 + }, + { + "entropy": 0.20456759482622147, + "epoch": 7.759951749095295, + "grad_norm": 0.7541831731796265, + "learning_rate": 3.453637675292839e-05, + "loss": 0.14354816675186158, + "mean_token_accuracy": 0.9517848521471024, + "num_tokens": 9029959.0, + "step": 3220 + }, + { + "epoch": 7.759951749095295, + "eval_entropy": 0.2932077002491844, + "eval_loss": 0.6895456910133362, + "eval_mean_token_accuracy": 0.8489299101775951, + "eval_num_tokens": 9029959.0, + "eval_runtime": 55.1675, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3220 + }, + { + "entropy": 0.2000926498323679, + "epoch": 7.808202653799759, + "grad_norm": 0.7815287113189697, + "learning_rate": 3.3141107643249166e-05, + "loss": 0.1415714740753174, + "mean_token_accuracy": 0.9534137606620788, + "num_tokens": 9088384.0, + "step": 3240 + }, + { + "epoch": 7.808202653799759, + "eval_entropy": 0.2927940240067043, + "eval_loss": 0.6980717182159424, + "eval_mean_token_accuracy": 0.8483662980326107, + "eval_num_tokens": 9088384.0, + "eval_runtime": 55.1596, + "eval_samples_per_second": 25.743, + "eval_steps_per_second": 3.227, + "step": 3240 + }, + { + "entropy": 0.2064586240798235, + "epoch": 7.856453558504222, + "grad_norm": 0.6987279653549194, + "learning_rate": 3.177001558663355e-05, + "loss": 0.1457617998123169, + "mean_token_accuracy": 0.9520302176475525, + "num_tokens": 9144987.0, + "step": 3260 + }, + { + "epoch": 7.856453558504222, + "eval_entropy": 0.29009542522135745, + "eval_loss": 0.6970582008361816, + "eval_mean_token_accuracy": 0.848495197095228, + "eval_num_tokens": 9144987.0, + "eval_runtime": 55.1791, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3260 + }, + { + "entropy": 0.20724620223045348, + "epoch": 7.904704463208685, + "grad_norm": 0.8427369594573975, + "learning_rate": 3.0423488585915043e-05, + "loss": 0.14918961524963378, + "mean_token_accuracy": 0.9520222991704941, + "num_tokens": 9198736.0, + "step": 3280 + }, + { + "epoch": 7.904704463208685, + "eval_entropy": 0.29056421843137636, + "eval_loss": 0.6968309879302979, + "eval_mean_token_accuracy": 0.8493872813964158, + "eval_num_tokens": 9198736.0, + "eval_runtime": 55.1904, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3280 + }, + { + "entropy": 0.20750712640583516, + "epoch": 7.952955367913148, + "grad_norm": 0.699073076248169, + "learning_rate": 2.910190769230703e-05, + "loss": 0.1463977336883545, + "mean_token_accuracy": 0.9525043666362762, + "num_tokens": 9254624.0, + "step": 3300 + }, + { + "epoch": 7.952955367913148, + "eval_entropy": 0.2956364156489962, + "eval_loss": 0.6904668211936951, + "eval_mean_token_accuracy": 0.8487593923391921, + "eval_num_tokens": 9254624.0, + "eval_runtime": 55.1975, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3300 + }, + { + "entropy": 0.2009814847738315, + "epoch": 8.0, + "grad_norm": 2.192417621612549, + "learning_rate": 2.7805646897569558e-05, + "loss": 0.14293937683105468, + "mean_token_accuracy": 0.95244728296231, + "num_tokens": 9309504.0, + "step": 3320 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2898739650008384, + "eval_loss": 0.6985539793968201, + "eval_mean_token_accuracy": 0.8486044373405114, + "eval_num_tokens": 9309504.0, + "eval_runtime": 55.1874, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 3320 + }, + { + "entropy": 0.1747811622917652, + "epoch": 8.048250904704464, + "grad_norm": 0.6563359498977661, + "learning_rate": 2.653507302817429e-05, + "loss": 0.09911853075027466, + "mean_token_accuracy": 0.9703438818454743, + "num_tokens": 9363278.0, + "step": 3340 + }, + { + "epoch": 8.048250904704464, + "eval_entropy": 0.2652904349431563, + "eval_loss": 0.7738804817199707, + "eval_mean_token_accuracy": 0.8466940669531233, + "eval_num_tokens": 9363278.0, + "eval_runtime": 55.1838, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3340 + }, + { + "entropy": 0.15915328189730643, + "epoch": 8.096501809408926, + "grad_norm": 0.7250285744667053, + "learning_rate": 2.5290545641496805e-05, + "loss": 0.09311577081680297, + "mean_token_accuracy": 0.9713594883680343, + "num_tokens": 9418505.0, + "step": 3360 + }, + { + "epoch": 8.096501809408926, + "eval_entropy": 0.26534568283999904, + "eval_loss": 0.7698941826820374, + "eval_mean_token_accuracy": 0.8466719904642427, + "eval_num_tokens": 9418505.0, + "eval_runtime": 55.1968, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3360 + }, + { + "entropy": 0.1663580035790801, + "epoch": 8.14475271411339, + "grad_norm": 0.6403616070747375, + "learning_rate": 2.4072416924066163e-05, + "loss": 0.1001995325088501, + "mean_token_accuracy": 0.9687129512429238, + "num_tokens": 9473735.0, + "step": 3380 + }, + { + "epoch": 8.14475271411339, + "eval_entropy": 0.2670054007949454, + "eval_loss": 0.7662967443466187, + "eval_mean_token_accuracy": 0.846786938021692, + "eval_num_tokens": 9473735.0, + "eval_runtime": 55.1841, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3380 + }, + { + "entropy": 0.15221141315996647, + "epoch": 8.193003618817853, + "grad_norm": 0.7957155108451843, + "learning_rate": 2.2881031591900387e-05, + "loss": 0.08914719820022583, + "mean_token_accuracy": 0.9729975983500481, + "num_tokens": 9534946.0, + "step": 3400 + }, + { + "epoch": 8.193003618817853, + "eval_entropy": 0.26733438310663354, + "eval_loss": 0.7707550525665283, + "eval_mean_token_accuracy": 0.8461319924740309, + "eval_num_tokens": 9534946.0, + "eval_runtime": 55.1757, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3400 + }, + { + "entropy": 0.15542738791555166, + "epoch": 8.241254523522317, + "grad_norm": 0.5750007629394531, + "learning_rate": 2.171672679295568e-05, + "loss": 0.09130602478981018, + "mean_token_accuracy": 0.9710182502865792, + "num_tokens": 9596302.0, + "step": 3420 + }, + { + "epoch": 8.241254523522317, + "eval_entropy": 0.26393357898746983, + "eval_loss": 0.7720436453819275, + "eval_mean_token_accuracy": 0.8474090959918633, + "eval_num_tokens": 9596302.0, + "eval_runtime": 55.1825, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3420 + }, + { + "entropy": 0.16351462248712778, + "epoch": 8.289505428226779, + "grad_norm": 0.7309626340866089, + "learning_rate": 2.057983201171781e-05, + "loss": 0.0985127031803131, + "mean_token_accuracy": 0.9697432905435562, + "num_tokens": 9646329.0, + "step": 3440 + }, + { + "epoch": 8.289505428226779, + "eval_entropy": 0.2609939365072197, + "eval_loss": 0.7792959213256836, + "eval_mean_token_accuracy": 0.8471295776661862, + "eval_num_tokens": 9646329.0, + "eval_runtime": 55.194, + "eval_samples_per_second": 25.727, + "eval_steps_per_second": 3.225, + "step": 3440 + }, + { + "entropy": 0.1608191981911659, + "epoch": 8.337756332931242, + "grad_norm": 0.6144416928291321, + "learning_rate": 1.947066897596166e-05, + "loss": 0.09536871314048767, + "mean_token_accuracy": 0.9700575843453407, + "num_tokens": 9703269.0, + "step": 3460 + }, + { + "epoch": 8.337756332931242, + "eval_entropy": 0.2646808089332634, + "eval_loss": 0.7733453512191772, + "eval_mean_token_accuracy": 0.8464634428533275, + "eval_num_tokens": 9703269.0, + "eval_runtime": 55.1981, + "eval_samples_per_second": 25.726, + "eval_steps_per_second": 3.225, + "step": 3460 + }, + { + "entropy": 0.15692227762192487, + "epoch": 8.386007237635706, + "grad_norm": 0.7023665308952332, + "learning_rate": 1.8389551565706204e-05, + "loss": 0.0954119086265564, + "mean_token_accuracy": 0.9701876968145371, + "num_tokens": 9760710.0, + "step": 3480 + }, + { + "epoch": 8.386007237635706, + "eval_entropy": 0.25970463814695227, + "eval_loss": 0.7781035304069519, + "eval_mean_token_accuracy": 0.8474177078584607, + "eval_num_tokens": 9760710.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3480 + }, + { + "entropy": 0.16045771054923536, + "epoch": 8.43425814234017, + "grad_norm": 0.6111961007118225, + "learning_rate": 1.7336785724390205e-05, + "loss": 0.09789881110191345, + "mean_token_accuracy": 0.969735924899578, + "num_tokens": 9816150.0, + "step": 3500 + }, + { + "epoch": 8.43425814234017, + "eval_entropy": 0.26607431437862056, + "eval_loss": 0.7712005972862244, + "eval_mean_token_accuracy": 0.8459033410200912, + "eval_num_tokens": 9816150.0, + "eval_runtime": 55.1744, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3500 + }, + { + "entropy": 0.1559722937643528, + "epoch": 8.482509047044632, + "grad_norm": 0.6566210389137268, + "learning_rate": 1.6312669372293666e-05, + "loss": 0.09393646121025086, + "mean_token_accuracy": 0.9715360820293426, + "num_tokens": 9871871.0, + "step": 3520 + }, + { + "epoch": 8.482509047044632, + "eval_entropy": 0.2609174002924662, + "eval_loss": 0.7844049334526062, + "eval_mean_token_accuracy": 0.8466568261050107, + "eval_num_tokens": 9871871.0, + "eval_runtime": 55.1787, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 3520 + }, + { + "entropy": 0.1532884443178773, + "epoch": 8.530759951749095, + "grad_norm": 0.6216753721237183, + "learning_rate": 1.531749232223018e-05, + "loss": 0.09293950200080872, + "mean_token_accuracy": 0.9714278027415275, + "num_tokens": 9928709.0, + "step": 3540 + }, + { + "epoch": 8.530759951749095, + "eval_entropy": 0.2630744833457336, + "eval_loss": 0.7802227139472961, + "eval_mean_token_accuracy": 0.846255295062333, + "eval_num_tokens": 9928709.0, + "eval_runtime": 55.1631, + "eval_samples_per_second": 25.742, + "eval_steps_per_second": 3.227, + "step": 3540 + }, + { + "entropy": 0.1553487192839384, + "epoch": 8.579010856453559, + "grad_norm": 0.7456594705581665, + "learning_rate": 1.4351536197533074e-05, + "loss": 0.09566901326179504, + "mean_token_accuracy": 0.97108214199543, + "num_tokens": 9984558.0, + "step": 3560 + }, + { + "epoch": 8.579010856453559, + "eval_entropy": 0.26115024215384813, + "eval_loss": 0.7809329628944397, + "eval_mean_token_accuracy": 0.8467723797546344, + "eval_num_tokens": 9984558.0, + "eval_runtime": 55.148, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3560 + }, + { + "entropy": 0.1595249420031905, + "epoch": 8.627261761158021, + "grad_norm": 0.7340289950370789, + "learning_rate": 1.3415074352359433e-05, + "loss": 0.09668846726417542, + "mean_token_accuracy": 0.9696339756250382, + "num_tokens": 10039763.0, + "step": 3580 + }, + { + "epoch": 8.627261761158021, + "eval_entropy": 0.2647377721379312, + "eval_loss": 0.7739617228507996, + "eval_mean_token_accuracy": 0.8465174942204122, + "eval_num_tokens": 10039763.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3580 + }, + { + "entropy": 0.1583833245560527, + "epoch": 8.675512665862485, + "grad_norm": 0.6431241631507874, + "learning_rate": 1.2508371794334224e-05, + "loss": 0.09521735310554505, + "mean_token_accuracy": 0.9699321657419204, + "num_tokens": 10095098.0, + "step": 3600 + }, + { + "epoch": 8.675512665862485, + "eval_entropy": 0.2621892829624455, + "eval_loss": 0.7764760255813599, + "eval_mean_token_accuracy": 0.8469243655713756, + "eval_num_tokens": 10095098.0, + "eval_runtime": 55.1393, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 3.228, + "step": 3600 + }, + { + "entropy": 0.15372174456715584, + "epoch": 8.723763570566948, + "grad_norm": 0.632522702217102, + "learning_rate": 1.163168510955608e-05, + "loss": 0.09367120265960693, + "mean_token_accuracy": 0.9709951281547546, + "num_tokens": 10152831.0, + "step": 3620 + }, + { + "epoch": 8.723763570566948, + "eval_entropy": 0.26019893243406594, + "eval_loss": 0.7842312455177307, + "eval_mean_token_accuracy": 0.8465140887190786, + "eval_num_tokens": 10152831.0, + "eval_runtime": 55.1348, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3620 + }, + { + "entropy": 0.15217966660857202, + "epoch": 8.772014475271412, + "grad_norm": 0.5612668395042419, + "learning_rate": 1.078526238998661e-05, + "loss": 0.09193292260169983, + "mean_token_accuracy": 0.9713213533163071, + "num_tokens": 10209639.0, + "step": 3640 + }, + { + "epoch": 8.772014475271412, + "eval_entropy": 0.26003232162989925, + "eval_loss": 0.7822859883308411, + "eval_mean_token_accuracy": 0.8468798703691932, + "eval_num_tokens": 10209639.0, + "eval_runtime": 55.1916, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 3640 + }, + { + "entropy": 0.15681463126093148, + "epoch": 8.820265379975874, + "grad_norm": 0.5372915267944336, + "learning_rate": 9.969343163243224e-06, + "loss": 0.09630222916603089, + "mean_token_accuracy": 0.9702456504106521, + "num_tokens": 10263716.0, + "step": 3660 + }, + { + "epoch": 8.820265379975874, + "eval_entropy": 0.2598928563882796, + "eval_loss": 0.7829110026359558, + "eval_mean_token_accuracy": 0.8469086342983032, + "eval_num_tokens": 10263716.0, + "eval_runtime": 55.1354, + "eval_samples_per_second": 25.755, + "eval_steps_per_second": 3.228, + "step": 3660 + }, + { + "entropy": 0.1535819811746478, + "epoch": 8.868516284680338, + "grad_norm": 0.575184166431427, + "learning_rate": 9.184158324815683e-06, + "loss": 0.09521135687828064, + "mean_token_accuracy": 0.9696467980742455, + "num_tokens": 10320171.0, + "step": 3680 + }, + { + "epoch": 8.868516284680338, + "eval_entropy": 0.2629028752948461, + "eval_loss": 0.7783747911453247, + "eval_mean_token_accuracy": 0.8463931284593732, + "eval_num_tokens": 10320171.0, + "eval_runtime": 55.1499, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3680 + }, + { + "entropy": 0.15672272052615882, + "epoch": 8.916767189384801, + "grad_norm": 0.610146701335907, + "learning_rate": 8.429930072725457e-06, + "loss": 0.09335047006607056, + "mean_token_accuracy": 0.9709506019949913, + "num_tokens": 10377086.0, + "step": 3700 + }, + { + "epoch": 8.916767189384801, + "eval_entropy": 0.26106021611878044, + "eval_loss": 0.7824530005455017, + "eval_mean_token_accuracy": 0.8467274777005228, + "eval_num_tokens": 10377086.0, + "eval_runtime": 55.1677, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3700 + }, + { + "entropy": 0.15496804118156432, + "epoch": 8.965018094089265, + "grad_norm": 0.8847843408584595, + "learning_rate": 7.706871844646178e-06, + "loss": 0.09552072882652282, + "mean_token_accuracy": 0.9693930178880692, + "num_tokens": 10435272.0, + "step": 3720 + }, + { + "epoch": 8.965018094089265, + "eval_entropy": 0.26175538701622675, + "eval_loss": 0.7833470106124878, + "eval_mean_token_accuracy": 0.8464220507761065, + "eval_num_tokens": 10435272.0, + "eval_runtime": 55.1496, + "eval_samples_per_second": 25.748, + "eval_steps_per_second": 3.228, + "step": 3720 + }, + { + "entropy": 0.15739625711471605, + "epoch": 9.012062726176115, + "grad_norm": 0.35802221298217773, + "learning_rate": 7.0151882575034775e-06, + "loss": 0.09389110803604125, + "mean_token_accuracy": 0.9715636464265677, + "num_tokens": 10487076.0, + "step": 3740 + }, + { + "epoch": 9.012062726176115, + "eval_entropy": 0.26084648198291155, + "eval_loss": 0.7853291034698486, + "eval_mean_token_accuracy": 0.8463665585169632, + "eval_num_tokens": 10487076.0, + "eval_runtime": 55.1572, + "eval_samples_per_second": 25.745, + "eval_steps_per_second": 3.227, + "step": 3740 + }, + { + "entropy": 0.14215838070958853, + "epoch": 9.060313630880579, + "grad_norm": 0.4684004783630371, + "learning_rate": 6.35507504957069e-06, + "loss": 0.07583575248718262, + "mean_token_accuracy": 0.9782516390085221, + "num_tokens": 10543520.0, + "step": 3760 + }, + { + "epoch": 9.060313630880579, + "eval_entropy": 0.254280548333452, + "eval_loss": 0.807111382484436, + "eval_mean_token_accuracy": 0.8463215992022096, + "eval_num_tokens": 10543520.0, + "eval_runtime": 55.1805, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 3760 + }, + { + "entropy": 0.1408092312514782, + "epoch": 9.108564535585042, + "grad_norm": 0.43550431728363037, + "learning_rate": 5.726719025077231e-06, + "loss": 0.07781847715377807, + "mean_token_accuracy": 0.9768255725502968, + "num_tokens": 10596528.0, + "step": 3780 + }, + { + "epoch": 9.108564535585042, + "eval_entropy": 0.2505798229340757, + "eval_loss": 0.8224650025367737, + "eval_mean_token_accuracy": 0.8463811020502884, + "eval_num_tokens": 10596528.0, + "eval_runtime": 55.1754, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 3780 + }, + { + "entropy": 0.13897503707557918, + "epoch": 9.156815440289506, + "grad_norm": 0.5255228281021118, + "learning_rate": 5.130298001345343e-06, + "loss": 0.07528382539749146, + "mean_token_accuracy": 0.9768329098820686, + "num_tokens": 10655544.0, + "step": 3800 + }, + { + "epoch": 9.156815440289506, + "eval_entropy": 0.2509520294960965, + "eval_loss": 0.8247353434562683, + "eval_mean_token_accuracy": 0.845786559112956, + "eval_num_tokens": 10655544.0, + "eval_runtime": 55.151, + "eval_samples_per_second": 25.747, + "eval_steps_per_second": 3.228, + "step": 3800 + }, + { + "entropy": 0.13998530581593513, + "epoch": 9.205066344993968, + "grad_norm": 0.4610442817211151, + "learning_rate": 4.565980758469731e-06, + "loss": 0.07813523411750793, + "mean_token_accuracy": 0.9769201070070267, + "num_tokens": 10710737.0, + "step": 3820 + }, + { + "epoch": 9.205066344993968, + "eval_entropy": 0.2503350620691696, + "eval_loss": 0.8271610736846924, + "eval_mean_token_accuracy": 0.8455204879969693, + "eval_num_tokens": 10710737.0, + "eval_runtime": 55.1706, + "eval_samples_per_second": 25.738, + "eval_steps_per_second": 3.226, + "step": 3820 + }, + { + "entropy": 0.13162653651088477, + "epoch": 9.253317249698432, + "grad_norm": 0.3627360463142395, + "learning_rate": 4.033926991554922e-06, + "loss": 0.07141604423522949, + "mean_token_accuracy": 0.9792275875806808, + "num_tokens": 10771727.0, + "step": 3840 + }, + { + "epoch": 9.253317249698432, + "eval_entropy": 0.24897396087311627, + "eval_loss": 0.8307036757469177, + "eval_mean_token_accuracy": 0.8453606835911783, + "eval_num_tokens": 10771727.0, + "eval_runtime": 55.168, + "eval_samples_per_second": 25.74, + "eval_steps_per_second": 3.227, + "step": 3840 + }, + { + "entropy": 0.14422765467315912, + "epoch": 9.301568154402895, + "grad_norm": 0.5706251263618469, + "learning_rate": 3.53428726552335e-06, + "loss": 0.07931464314460754, + "mean_token_accuracy": 0.9752222061157226, + "num_tokens": 10825849.0, + "step": 3860 + }, + { + "epoch": 9.301568154402895, + "eval_entropy": 0.24914598389622872, + "eval_loss": 0.8312752842903137, + "eval_mean_token_accuracy": 0.8455510604917333, + "eval_num_tokens": 10825849.0, + "eval_runtime": 55.1849, + "eval_samples_per_second": 25.732, + "eval_steps_per_second": 3.226, + "step": 3860 + }, + { + "entropy": 0.13540438804775476, + "epoch": 9.349819059107359, + "grad_norm": 0.6334630250930786, + "learning_rate": 3.0672029725073196e-06, + "loss": 0.07548695802688599, + "mean_token_accuracy": 0.9778543844819069, + "num_tokens": 10884611.0, + "step": 3880 + }, + { + "epoch": 9.349819059107359, + "eval_entropy": 0.24910795722114906, + "eval_loss": 0.8316059112548828, + "eval_mean_token_accuracy": 0.8456973557391864, + "eval_num_tokens": 10884611.0, + "eval_runtime": 55.1728, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3880 + }, + { + "entropy": 0.1410717975348234, + "epoch": 9.398069963811821, + "grad_norm": 0.5009909868240356, + "learning_rate": 2.632806291836666e-06, + "loss": 0.07720760703086853, + "mean_token_accuracy": 0.9768648758530617, + "num_tokens": 10938300.0, + "step": 3900 + }, + { + "epoch": 9.398069963811821, + "eval_entropy": 0.24874219742048992, + "eval_loss": 0.8328408598899841, + "eval_mean_token_accuracy": 0.8455627888775943, + "eval_num_tokens": 10938300.0, + "eval_runtime": 55.1736, + "eval_samples_per_second": 25.737, + "eval_steps_per_second": 3.226, + "step": 3900 + }, + { + "entropy": 0.13674762714654207, + "epoch": 9.446320868516285, + "grad_norm": 0.46003177762031555, + "learning_rate": 2.231220152633621e-06, + "loss": 0.07583877444267273, + "mean_token_accuracy": 0.9772356480360032, + "num_tokens": 10994442.0, + "step": 3920 + }, + { + "epoch": 9.446320868516285, + "eval_entropy": 0.24896439030933915, + "eval_loss": 0.8331801891326904, + "eval_mean_token_accuracy": 0.8453999262177543, + "eval_num_tokens": 10994442.0, + "eval_runtime": 55.1815, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 3920 + }, + { + "entropy": 0.1378044320270419, + "epoch": 9.494571773220748, + "grad_norm": 0.5056300759315491, + "learning_rate": 1.862558199025263e-06, + "loss": 0.07420622110366822, + "mean_token_accuracy": 0.9771117404103279, + "num_tokens": 11052708.0, + "step": 3940 + }, + { + "epoch": 9.494571773220748, + "eval_entropy": 0.2490867761413703, + "eval_loss": 0.8338391780853271, + "eval_mean_token_accuracy": 0.8453461571355884, + "eval_num_tokens": 11052708.0, + "eval_runtime": 59.1613, + "eval_samples_per_second": 24.002, + "eval_steps_per_second": 3.009, + "step": 3940 + }, + { + "entropy": 0.13731490727514029, + "epoch": 9.54282267792521, + "grad_norm": 0.5770965218544006, + "learning_rate": 1.5269247579836162e-06, + "loss": 0.07547505497932434, + "mean_token_accuracy": 0.9772329092025757, + "num_tokens": 11106761.0, + "step": 3960 + }, + { + "epoch": 9.54282267792521, + "eval_entropy": 0.24866497918461147, + "eval_loss": 0.8349169492721558, + "eval_mean_token_accuracy": 0.8455062398080075, + "eval_num_tokens": 11106761.0, + "eval_runtime": 55.1484, + "eval_samples_per_second": 25.749, + "eval_steps_per_second": 3.228, + "step": 3960 + }, + { + "entropy": 0.13437952492386102, + "epoch": 9.591073582629674, + "grad_norm": 0.5746839046478271, + "learning_rate": 1.2244148098023241e-06, + "loss": 0.0719214141368866, + "mean_token_accuracy": 0.9783813208341599, + "num_tokens": 11163241.0, + "step": 3980 + }, + { + "epoch": 9.591073582629674, + "eval_entropy": 0.2487325757909357, + "eval_loss": 0.8345232009887695, + "eval_mean_token_accuracy": 0.8452854909923639, + "eval_num_tokens": 11163241.0, + "eval_runtime": 55.2029, + "eval_samples_per_second": 25.723, + "eval_steps_per_second": 3.224, + "step": 3980 + }, + { + "entropy": 0.1449177075177431, + "epoch": 9.639324487334138, + "grad_norm": 0.7029407024383545, + "learning_rate": 9.551139612183896e-07, + "loss": 0.08021060228347779, + "mean_token_accuracy": 0.9748102590441704, + "num_tokens": 11215311.0, + "step": 4000 + }, + { + "epoch": 9.639324487334138, + "eval_entropy": 0.24849342898036655, + "eval_loss": 0.8350681066513062, + "eval_mean_token_accuracy": 0.8454755872822879, + "eval_num_tokens": 11215311.0, + "eval_runtime": 55.1818, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 4000 + }, + { + "entropy": 0.13718088436871767, + "epoch": 9.687575392038601, + "grad_norm": 0.4247730076313019, + "learning_rate": 7.190984211864178e-07, + "loss": 0.0763831913471222, + "mean_token_accuracy": 0.9777001023292542, + "num_tokens": 11272587.0, + "step": 4020 + }, + { + "epoch": 9.687575392038601, + "eval_entropy": 0.2485166782241189, + "eval_loss": 0.8347740769386292, + "eval_mean_token_accuracy": 0.8455832382936156, + "eval_num_tokens": 11272587.0, + "eval_runtime": 55.1819, + "eval_samples_per_second": 25.733, + "eval_steps_per_second": 3.226, + "step": 4020 + }, + { + "entropy": 0.1414831655099988, + "epoch": 9.735826296743063, + "grad_norm": 0.4344032108783722, + "learning_rate": 5.164349793124746e-07, + "loss": 0.0786937952041626, + "mean_token_accuracy": 0.9762448608875275, + "num_tokens": 11326845.0, + "step": 4040 + }, + { + "epoch": 9.735826296743063, + "eval_entropy": 0.2485147834326444, + "eval_loss": 0.8348782658576965, + "eval_mean_token_accuracy": 0.8454727480250798, + "eval_num_tokens": 11326845.0, + "eval_runtime": 55.1896, + "eval_samples_per_second": 25.729, + "eval_steps_per_second": 3.225, + "step": 4040 + }, + { + "entropy": 0.1397345969453454, + "epoch": 9.784077201447527, + "grad_norm": 0.5865362882614136, + "learning_rate": 3.4718098695330847e-07, + "loss": 0.07839923501014709, + "mean_token_accuracy": 0.9766460061073303, + "num_tokens": 11381321.0, + "step": 4060 + }, + { + "epoch": 9.784077201447527, + "eval_entropy": 0.24835219778371662, + "eval_loss": 0.8349990248680115, + "eval_mean_token_accuracy": 0.8452944608216875, + "eval_num_tokens": 11381321.0, + "eval_runtime": 55.1862, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 4060 + }, + { + "entropy": 0.13877951726317406, + "epoch": 9.83232810615199, + "grad_norm": 0.36521604657173157, + "learning_rate": 2.1138434098667948e-07, + "loss": 0.0738587200641632, + "mean_token_accuracy": 0.9764896467328071, + "num_tokens": 11441968.0, + "step": 4080 + }, + { + "epoch": 9.83232810615199, + "eval_entropy": 0.24834532483240193, + "eval_loss": 0.8350111246109009, + "eval_mean_token_accuracy": 0.8456257598453694, + "eval_num_tokens": 11441968.0, + "eval_runtime": 55.1871, + "eval_samples_per_second": 25.731, + "eval_steps_per_second": 3.225, + "step": 4080 + }, + { + "entropy": 0.1345276204869151, + "epoch": 9.880579010856454, + "grad_norm": 0.45626401901245117, + "learning_rate": 1.0908347025708512e-07, + "loss": 0.07468653917312622, + "mean_token_accuracy": 0.978096280992031, + "num_tokens": 11500487.0, + "step": 4100 + }, + { + "epoch": 9.880579010856454, + "eval_entropy": 0.2485178895713238, + "eval_loss": 0.834865152835846, + "eval_mean_token_accuracy": 0.8453632285085957, + "eval_num_tokens": 11500487.0, + "eval_runtime": 55.1746, + "eval_samples_per_second": 25.736, + "eval_steps_per_second": 3.226, + "step": 4100 + }, + { + "entropy": 0.1314420524984598, + "epoch": 9.928829915560916, + "grad_norm": 0.5756514072418213, + "learning_rate": 4.0307324700819896e-08, + "loss": 0.07114983201026917, + "mean_token_accuracy": 0.9784522473812103, + "num_tokens": 11562246.0, + "step": 4120 + }, + { + "epoch": 9.928829915560916, + "eval_entropy": 0.24849412364236426, + "eval_loss": 0.8347920775413513, + "eval_mean_token_accuracy": 0.8454210158814205, + "eval_num_tokens": 11562246.0, + "eval_runtime": 55.179, + "eval_samples_per_second": 25.734, + "eval_steps_per_second": 3.226, + "step": 4120 + }, + { + "entropy": 0.14091254398226738, + "epoch": 9.97708082026538, + "grad_norm": 0.4619421064853668, + "learning_rate": 5.075367153567275e-09, + "loss": 0.07807959914207459, + "mean_token_accuracy": 0.9760556846857071, + "num_tokens": 11614714.0, + "step": 4140 + }, + { + "epoch": 9.97708082026538, + "eval_entropy": 0.24850971368926295, + "eval_loss": 0.8348681926727295, + "eval_mean_token_accuracy": 0.8453842609116201, + "eval_num_tokens": 11614714.0, + "eval_runtime": 55.1689, + "eval_samples_per_second": 25.739, + "eval_steps_per_second": 3.226, + "step": 4140 + }, + { + "epoch": 10.0, + "eval_entropy": 0.24856727210323462, + "eval_loss": 0.8349125981330872, + "eval_mean_token_accuracy": 0.845321658622013, + "eval_num_tokens": 11636880.0, + "eval_runtime": 55.1783, + "eval_samples_per_second": 25.735, + "eval_steps_per_second": 3.226, + "step": 4150 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.261265766344397e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..90a9c6e3444933b3c7e7ca9567aee009975a8146 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/trainer_state.json @@ -0,0 +1,475 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0120627261761157, + "eval_steps": 20, + "global_step": 420, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.382273534580736e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ba23f16881283d95f36108cf78155065e66a2ac7 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/trainer_state.json @@ -0,0 +1,496 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.060313630880579, + "eval_steps": 20, + "global_step": 440, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.72996732017664e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..813641bb5522c0e32518f53c4956f38379bbbe9b --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/trainer_state.json @@ -0,0 +1,517 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.1085645355850422, + "eval_steps": 20, + "global_step": 460, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.006665118169088e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5fb3abae6eea8669a2c871680f0aed05f0b7d99c --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/trainer_state.json @@ -0,0 +1,538 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.1568154402895054, + "eval_steps": 20, + "global_step": 480, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.315077038536704e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e51b9fdd3ea76614dfd44170dc1fab4fdf2a540a --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/trainer_state.json @@ -0,0 +1,559 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.2050663449939687, + "eval_steps": 20, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.607755093530624e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3fb50fe9c74fa4c5f633d0fa3b94b5b8b6971d3e --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/trainer_state.json @@ -0,0 +1,580 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.2533172496984317, + "eval_steps": 20, + "global_step": 520, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.924245139408896e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f571fef1acb4185eff2bfc4ed4ca792f2bcb0983 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/trainer_state.json @@ -0,0 +1,601 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.301568154402895, + "eval_steps": 20, + "global_step": 540, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.21236494928896e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..97544a70fc9fbd21e09e293463ad18beb39a77ef --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/trainer_state.json @@ -0,0 +1,622 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.3498190591073582, + "eval_steps": 20, + "global_step": 560, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.496366499104768e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a620bdb656275804d094dd1cf94d256ed5b8a4d4 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/trainer_state.json @@ -0,0 +1,643 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.3980699638118215, + "eval_steps": 20, + "global_step": 580, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.765813343479808e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..184c4bc7ecc4f784378b999f8dc59337df51e9f6 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/trainer_state.json @@ -0,0 +1,97 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.14475271411338964, + "eval_steps": 20, + "global_step": 60, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9041516775260160.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d013c381f0359ab5f2f50d9de4dc29d05d8bc829 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/trainer_state.json @@ -0,0 +1,664 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.4463208685162847, + "eval_steps": 20, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.075386824378368e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..770899b33c6f0f688b9a7c7b7ab9d3a6676d5a2b --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/trainer_state.json @@ -0,0 +1,685 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.494571773220748, + "eval_steps": 20, + "global_step": 620, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.386984236505088e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..460eb0a3357e956951ba9e93b0950e4c8e55493d --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/trainer_state.json @@ -0,0 +1,706 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.5428226779252112, + "eval_steps": 20, + "global_step": 640, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.707029361584128e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8546d9c99f77a64072dcdaf9b2b7be8128059aab --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/trainer_state.json @@ -0,0 +1,727 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.5910735826296745, + "eval_steps": 20, + "global_step": 660, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.980119282169856e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ea6e763df688a23d907c3df6ff73ce9bc49e55df --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/trainer_state.json @@ -0,0 +1,748 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.6393244873341375, + "eval_steps": 20, + "global_step": 680, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0277232386463744e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7e884845471eab30b301e0fdb3b299fe37f4eb42 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/trainer_state.json @@ -0,0 +1,769 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.6875753920386007, + "eval_steps": 20, + "global_step": 700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0595024788088832e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d1cd89aa7f4b3f153ddb97886abfb1a13113ac2b --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/trainer_state.json @@ -0,0 +1,790 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.7358262967430638, + "eval_steps": 20, + "global_step": 720, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0932475721730048e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..099490c231985176cc60491bf43a23915e0a5696 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/trainer_state.json @@ -0,0 +1,811 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.784077201447527, + "eval_steps": 20, + "global_step": 740, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1222584264034304e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5cc5732a33fed3a1fb32e79319fe0ede732e4c11 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/trainer_state.json @@ -0,0 +1,832 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.8323281061519903, + "eval_steps": 20, + "global_step": 760, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1512446414710784e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4be3e98ced31815835d1721379b21388bf6da454 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/trainer_state.json @@ -0,0 +1,853 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.8805790108564535, + "eval_steps": 20, + "global_step": 780, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1817479249897472e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..aeabe043c7cd1b46bfad5a32f9f42aab80dc4994 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/trainer_state.json @@ -0,0 +1,118 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.19300361881785283, + "eval_steps": 20, + "global_step": 80, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.195122590527488e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..52d804d978766fcfd9d91001672cf3e385b5559c --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/trainer_state.json @@ -0,0 +1,874 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9288299155609168, + "eval_steps": 20, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.2128478282356736e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e64b2b37b3bdc6bc8eb99a47435cf6a5416f2bae --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/trainer_state.json @@ -0,0 +1,895 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.97708082026538, + "eval_steps": 20, + "global_step": 820, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.2408379171510272e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7e0202d5751b69438f8eaa92075e047a227809 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/trainer_state.json @@ -0,0 +1,916 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0241254523522314, + "eval_steps": 20, + "global_step": 840, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.2729755251364352e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5b8b329d58cd5ae23e21dc98ea213c32d10324d3 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/trainer_state.json @@ -0,0 +1,937 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0723763570566947, + "eval_steps": 20, + "global_step": 860, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3039258334654976e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..aa8a7b7661ff2212a0cfde7d18bc058981ff4a1f --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/trainer_state.json @@ -0,0 +1,958 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.120627261761158, + "eval_steps": 20, + "global_step": 880, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3343323202732544e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..40be38f6b2c755c2b88b10ac556d7dcfab2594df --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/trainer_state.json @@ -0,0 +1,979 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.168878166465621, + "eval_steps": 20, + "global_step": 900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3670091299369472e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..891850448a21cda608d0e17083f5c46487d08b81 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/trainer_state.json @@ -0,0 +1,1000 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.2171290711700844, + "eval_steps": 20, + "global_step": 920, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.399008362624256e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..364af36909ee2ab7acda1c5db08c26f8de783f37 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/trainer_state.json @@ -0,0 +1,1021 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.2653799758745476, + "eval_steps": 20, + "global_step": 940, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.4265179878654464e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fde0cc5d08e2279da33811c776b71cfe829c1796 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/trainer_state.json @@ -0,0 +1,1042 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.313630880579011, + "eval_steps": 20, + "global_step": 960, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.45455207528576e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen3-4B-Base +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen3-4B-Base +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen3-4B-Base", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.010466836799929592, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1f15a6139e0305ce3a401671fa1d4de3f3729793 --- /dev/null +++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/trainer_state.json @@ -0,0 +1,1063 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.361881785283474, + "eval_steps": 20, + "global_step": 980, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.6116197913885117, + "epoch": 0.04825090470446321, + "grad_norm": 1.1876262426376343, + "learning_rate": 1.0857557926642104e-05, + "loss": 2.5340471267700195, + "mean_token_accuracy": 0.5611534088850021, + "num_tokens": 57775.0, + "step": 20 + }, + { + "epoch": 0.04825090470446321, + "eval_entropy": 2.590608079781693, + "eval_loss": 2.5703318119049072, + "eval_mean_token_accuracy": 0.564014436488741, + "eval_num_tokens": 57775.0, + "eval_runtime": 55.7007, + "eval_samples_per_second": 25.493, + "eval_steps_per_second": 3.196, + "step": 20 + }, + { + "entropy": 2.440599513053894, + "epoch": 0.09650180940892641, + "grad_norm": 0.6993194818496704, + "learning_rate": 2.22865662704759e-05, + "loss": 2.3795186996459963, + "mean_token_accuracy": 0.5858899913728237, + "num_tokens": 116416.0, + "step": 40 + }, + { + "epoch": 0.09650180940892641, + "eval_entropy": 2.2737208745452797, + "eval_loss": 2.2082855701446533, + "eval_mean_token_accuracy": 0.6068195993645807, + "eval_num_tokens": 116416.0, + "eval_runtime": 55.3085, + "eval_samples_per_second": 25.674, + "eval_steps_per_second": 3.218, + "step": 40 + }, + { + "entropy": 1.9993192434310914, + "epoch": 0.14475271411338964, + "grad_norm": 1.542217493057251, + "learning_rate": 3.37155746143097e-05, + "loss": 1.8905294418334961, + "mean_token_accuracy": 0.63250722438097, + "num_tokens": 173175.0, + "step": 60 + }, + { + "epoch": 0.14475271411338964, + "eval_entropy": 1.6427591362696015, + "eval_loss": 1.5519300699234009, + "eval_mean_token_accuracy": 0.6690792203619239, + "eval_num_tokens": 173175.0, + "eval_runtime": 55.3032, + "eval_samples_per_second": 25.677, + "eval_steps_per_second": 3.219, + "step": 60 + }, + { + "entropy": 1.3025753945112228, + "epoch": 0.19300361881785283, + "grad_norm": 1.1189110279083252, + "learning_rate": 4.514458295814349e-05, + "loss": 1.2444252967834473, + "mean_token_accuracy": 0.7070476695895195, + "num_tokens": 227007.0, + "step": 80 + }, + { + "epoch": 0.19300361881785283, + "eval_entropy": 1.047531166773164, + "eval_loss": 1.01227867603302, + "eval_mean_token_accuracy": 0.7457922345466828, + "eval_num_tokens": 227007.0, + "eval_runtime": 55.28, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 80 + }, + { + "entropy": 0.9481177270412445, + "epoch": 0.24125452352231605, + "grad_norm": 0.8431965112686157, + "learning_rate": 5.657359130197729e-05, + "loss": 0.9006875038146973, + "mean_token_accuracy": 0.7657369375228882, + "num_tokens": 285829.0, + "step": 100 + }, + { + "epoch": 0.24125452352231605, + "eval_entropy": 0.8651716802897078, + "eval_loss": 0.8829421997070312, + "eval_mean_token_accuracy": 0.768848463725508, + "eval_num_tokens": 285829.0, + "eval_runtime": 55.2839, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 100 + }, + { + "entropy": 0.8492659643292427, + "epoch": 0.28950542822677927, + "grad_norm": 1.3254178762435913, + "learning_rate": 6.800259964581109e-05, + "loss": 0.8246038436889649, + "mean_token_accuracy": 0.7797851234674453, + "num_tokens": 342830.0, + "step": 120 + }, + { + "epoch": 0.28950542822677927, + "eval_entropy": 0.8532627442579591, + "eval_loss": 0.8232717514038086, + "eval_mean_token_accuracy": 0.7797894574952929, + "eval_num_tokens": 342830.0, + "eval_runtime": 55.2655, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 120 + }, + { + "entropy": 0.8127924099564552, + "epoch": 0.33775633293124246, + "grad_norm": 1.236405849456787, + "learning_rate": 7.943160798964488e-05, + "loss": 0.7842514514923096, + "mean_token_accuracy": 0.784331226348877, + "num_tokens": 401404.0, + "step": 140 + }, + { + "epoch": 0.33775633293124246, + "eval_entropy": 0.8258234252420704, + "eval_loss": 0.7888523936271667, + "eval_mean_token_accuracy": 0.7806731449084335, + "eval_num_tokens": 401404.0, + "eval_runtime": 55.2732, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 140 + }, + { + "entropy": 0.7900068521499634, + "epoch": 0.38600723763570566, + "grad_norm": 1.213863492012024, + "learning_rate": 9.086061633347867e-05, + "loss": 0.7628804206848144, + "mean_token_accuracy": 0.7918283134698868, + "num_tokens": 455170.0, + "step": 160 + }, + { + "epoch": 0.38600723763570566, + "eval_entropy": 0.7623147050316414, + "eval_loss": 0.7571278810501099, + "eval_mean_token_accuracy": 0.7960286254293463, + "eval_num_tokens": 455170.0, + "eval_runtime": 55.2949, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 160 + }, + { + "entropy": 0.7525638103485107, + "epoch": 0.43425814234016885, + "grad_norm": 1.0159046649932861, + "learning_rate": 0.00010228962467731246, + "loss": 0.720798110961914, + "mean_token_accuracy": 0.8003803327679634, + "num_tokens": 513433.0, + "step": 180 + }, + { + "epoch": 0.43425814234016885, + "eval_entropy": 0.7300447925422968, + "eval_loss": 0.7215597033500671, + "eval_mean_token_accuracy": 0.8043364601188832, + "eval_num_tokens": 513433.0, + "eval_runtime": 55.235, + "eval_samples_per_second": 25.708, + "eval_steps_per_second": 3.223, + "step": 180 + }, + { + "entropy": 0.7320861473679543, + "epoch": 0.4825090470446321, + "grad_norm": 0.9363995790481567, + "learning_rate": 0.00011371863302114625, + "loss": 0.6982485771179199, + "mean_token_accuracy": 0.8094118356704711, + "num_tokens": 572252.0, + "step": 200 + }, + { + "epoch": 0.4825090470446321, + "eval_entropy": 0.7166647703460093, + "eval_loss": 0.7090815305709839, + "eval_mean_token_accuracy": 0.8082149638218826, + "eval_num_tokens": 572252.0, + "eval_runtime": 55.2736, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 200 + }, + { + "entropy": 0.7224139869213104, + "epoch": 0.5307599517490953, + "grad_norm": 0.8827612996101379, + "learning_rate": 0.00012514764136498005, + "loss": 0.6893723487854004, + "mean_token_accuracy": 0.8103836163878441, + "num_tokens": 626415.0, + "step": 220 + }, + { + "epoch": 0.5307599517490953, + "eval_entropy": 0.7198703587055206, + "eval_loss": 0.6869194507598877, + "eval_mean_token_accuracy": 0.8128652288002914, + "eval_num_tokens": 626415.0, + "eval_runtime": 55.224, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 3.223, + "step": 220 + }, + { + "entropy": 0.6983707025647163, + "epoch": 0.5790108564535585, + "grad_norm": 0.9153295159339905, + "learning_rate": 0.00013657664970881386, + "loss": 0.6662442207336425, + "mean_token_accuracy": 0.8164364024996758, + "num_tokens": 681802.0, + "step": 240 + }, + { + "epoch": 0.5790108564535585, + "eval_entropy": 0.729364558886946, + "eval_loss": 0.67768794298172, + "eval_mean_token_accuracy": 0.8129584832807605, + "eval_num_tokens": 681802.0, + "eval_runtime": 55.2739, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 240 + }, + { + "entropy": 0.6905479088425637, + "epoch": 0.6272617611580217, + "grad_norm": 0.6342670321464539, + "learning_rate": 0.00014800565805264765, + "loss": 0.6569931507110596, + "mean_token_accuracy": 0.8193552777171135, + "num_tokens": 734912.0, + "step": 260 + }, + { + "epoch": 0.6272617611580217, + "eval_entropy": 0.6885626697808169, + "eval_loss": 0.663456380367279, + "eval_mean_token_accuracy": 0.8180048070596845, + "eval_num_tokens": 734912.0, + "eval_runtime": 55.2691, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 260 + }, + { + "entropy": 0.6714175209403038, + "epoch": 0.6755126658624849, + "grad_norm": 0.6685030460357666, + "learning_rate": 0.00015943466639648145, + "loss": 0.6428029060363769, + "mean_token_accuracy": 0.8227180182933808, + "num_tokens": 794671.0, + "step": 280 + }, + { + "epoch": 0.6755126658624849, + "eval_entropy": 0.70999454949679, + "eval_loss": 0.6489622592926025, + "eval_mean_token_accuracy": 0.8230955952338959, + "eval_num_tokens": 794671.0, + "eval_runtime": 55.3103, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 280 + }, + { + "entropy": 0.6617054045200348, + "epoch": 0.7237635705669482, + "grad_norm": 0.7328742742538452, + "learning_rate": 0.00017086367474031526, + "loss": 0.6275388717651367, + "mean_token_accuracy": 0.8258797079324722, + "num_tokens": 851166.0, + "step": 300 + }, + { + "epoch": 0.7237635705669482, + "eval_entropy": 0.676628519644898, + "eval_loss": 0.6419883370399475, + "eval_mean_token_accuracy": 0.8190852996338619, + "eval_num_tokens": 851166.0, + "eval_runtime": 55.3104, + "eval_samples_per_second": 25.673, + "eval_steps_per_second": 3.218, + "step": 300 + }, + { + "entropy": 0.667515504360199, + "epoch": 0.7720144752714113, + "grad_norm": 0.6317798495292664, + "learning_rate": 0.00018229268308414903, + "loss": 0.6375722408294677, + "mean_token_accuracy": 0.823223651945591, + "num_tokens": 907742.0, + "step": 320 + }, + { + "epoch": 0.7720144752714113, + "eval_entropy": 0.7044156696019548, + "eval_loss": 0.6329143643379211, + "eval_mean_token_accuracy": 0.8259416510549824, + "eval_num_tokens": 907742.0, + "eval_runtime": 55.2563, + "eval_samples_per_second": 25.698, + "eval_steps_per_second": 3.221, + "step": 320 + }, + { + "entropy": 0.6768256008625031, + "epoch": 0.8202653799758746, + "grad_norm": 0.5729309916496277, + "learning_rate": 0.00019372169142798285, + "loss": 0.6379447937011719, + "mean_token_accuracy": 0.8250771954655647, + "num_tokens": 967618.0, + "step": 340 + }, + { + "epoch": 0.8202653799758746, + "eval_entropy": 0.6505340352821886, + "eval_loss": 0.6226425170898438, + "eval_mean_token_accuracy": 0.8273337463314614, + "eval_num_tokens": 967618.0, + "eval_runtime": 55.2588, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 340 + }, + { + "entropy": 0.6730180442333221, + "epoch": 0.8685162846803377, + "grad_norm": 0.5941824316978455, + "learning_rate": 0.00020515069977181664, + "loss": 0.6342682361602783, + "mean_token_accuracy": 0.8241883024573327, + "num_tokens": 1018333.0, + "step": 360 + }, + { + "epoch": 0.8685162846803377, + "eval_entropy": 0.6627941898415598, + "eval_loss": 0.6253094673156738, + "eval_mean_token_accuracy": 0.8266657035002548, + "eval_num_tokens": 1018333.0, + "eval_runtime": 55.2725, + "eval_samples_per_second": 25.691, + "eval_steps_per_second": 3.22, + "step": 360 + }, + { + "entropy": 0.6687290579080581, + "epoch": 0.916767189384801, + "grad_norm": 0.6753661036491394, + "learning_rate": 0.00021657970811565043, + "loss": 0.6109315872192382, + "mean_token_accuracy": 0.8278054997324944, + "num_tokens": 1069111.0, + "step": 380 + }, + { + "epoch": 0.916767189384801, + "eval_entropy": 0.6449436111731476, + "eval_loss": 0.6159152984619141, + "eval_mean_token_accuracy": 0.8295929308017987, + "eval_num_tokens": 1069111.0, + "eval_runtime": 55.2434, + "eval_samples_per_second": 25.704, + "eval_steps_per_second": 3.222, + "step": 380 + }, + { + "entropy": 0.6549551770091057, + "epoch": 0.9650180940892642, + "grad_norm": 0.6604854464530945, + "learning_rate": 0.00022800871645948422, + "loss": 0.6144959926605225, + "mean_token_accuracy": 0.8307996809482574, + "num_tokens": 1123008.0, + "step": 400 + }, + { + "epoch": 0.9650180940892642, + "eval_entropy": 0.6285942687412326, + "eval_loss": 0.6156108975410461, + "eval_mean_token_accuracy": 0.8301522892512633, + "eval_num_tokens": 1123008.0, + "eval_runtime": 55.2964, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "entropy": 0.6362387537956238, + "epoch": 1.0120627261761157, + "grad_norm": 0.6493216156959534, + "learning_rate": 0.00023715125200746723, + "loss": 0.5960193634033203, + "mean_token_accuracy": 0.831819243920155, + "num_tokens": 1177682.0, + "step": 420 + }, + { + "epoch": 1.0120627261761157, + "eval_entropy": 0.6311206628432434, + "eval_loss": 0.6114970445632935, + "eval_mean_token_accuracy": 0.8305217643802085, + "eval_num_tokens": 1177682.0, + "eval_runtime": 55.2941, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 420 + }, + { + "entropy": 0.6032533653080463, + "epoch": 1.060313630880579, + "grad_norm": 0.5768770575523376, + "learning_rate": 0.0002371277633572037, + "loss": 0.570250129699707, + "mean_token_accuracy": 0.8371127635240555, + "num_tokens": 1238001.0, + "step": 440 + }, + { + "epoch": 1.060313630880579, + "eval_entropy": 0.5940044235982253, + "eval_loss": 0.6010516285896301, + "eval_mean_token_accuracy": 0.8339078124989284, + "eval_num_tokens": 1238001.0, + "eval_runtime": 55.3141, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 440 + }, + { + "entropy": 0.6125824645161628, + "epoch": 1.1085645355850422, + "grad_norm": 0.5974647402763367, + "learning_rate": 0.00023707072594936633, + "loss": 0.5693985462188721, + "mean_token_accuracy": 0.8372680127620697, + "num_tokens": 1294494.0, + "step": 460 + }, + { + "epoch": 1.1085645355850422, + "eval_entropy": 0.6013931218492851, + "eval_loss": 0.6000593900680542, + "eval_mean_token_accuracy": 0.8335993320084689, + "eval_num_tokens": 1294494.0, + "eval_runtime": 55.3351, + "eval_samples_per_second": 25.662, + "eval_steps_per_second": 3.217, + "step": 460 + }, + { + "entropy": 0.6020087823271751, + "epoch": 1.1568154402895054, + "grad_norm": 0.5234053134918213, + "learning_rate": 0.00023698015592486674, + "loss": 0.5627901554107666, + "mean_token_accuracy": 0.8367442533373832, + "num_tokens": 1352928.0, + "step": 480 + }, + { + "epoch": 1.1568154402895054, + "eval_entropy": 0.6235448820202538, + "eval_loss": 0.5950364470481873, + "eval_mean_token_accuracy": 0.8320885056190277, + "eval_num_tokens": 1352928.0, + "eval_runtime": 55.2811, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 480 + }, + { + "entropy": 0.6143898621201516, + "epoch": 1.2050663449939687, + "grad_norm": 0.606967031955719, + "learning_rate": 0.00023685607891395062, + "loss": 0.5775506019592285, + "mean_token_accuracy": 0.8369954064488411, + "num_tokens": 1406356.0, + "step": 500 + }, + { + "epoch": 1.2050663449939687, + "eval_entropy": 0.6467455027813322, + "eval_loss": 0.5900489091873169, + "eval_mean_token_accuracy": 0.8357991258080086, + "eval_num_tokens": 1406356.0, + "eval_runtime": 55.2709, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 500 + }, + { + "entropy": 0.6138403750956059, + "epoch": 1.2533172496984317, + "grad_norm": 0.6079438328742981, + "learning_rate": 0.00023669853002894432, + "loss": 0.5609864711761474, + "mean_token_accuracy": 0.8403474077582359, + "num_tokens": 1464150.0, + "step": 520 + }, + { + "epoch": 1.2533172496984317, + "eval_entropy": 0.6021975442934572, + "eval_loss": 0.5859882831573486, + "eval_mean_token_accuracy": 0.8359727501199486, + "eval_num_tokens": 1464150.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 520 + }, + { + "entropy": 0.6103575885295868, + "epoch": 1.301568154402895, + "grad_norm": 0.576837420463562, + "learning_rate": 0.00023650755385431872, + "loss": 0.5683893680572509, + "mean_token_accuracy": 0.8384664133191109, + "num_tokens": 1521444.0, + "step": 540 + }, + { + "epoch": 1.301568154402895, + "eval_entropy": 0.5911824041872882, + "eval_loss": 0.5858258605003357, + "eval_mean_token_accuracy": 0.8374535170164001, + "eval_num_tokens": 1521444.0, + "eval_runtime": 55.2523, + "eval_samples_per_second": 25.7, + "eval_steps_per_second": 3.222, + "step": 540 + }, + { + "entropy": 0.6077366039156914, + "epoch": 1.3498190591073582, + "grad_norm": 0.5415759086608887, + "learning_rate": 0.00023628320443407213, + "loss": 0.563680362701416, + "mean_token_accuracy": 0.8382868468761444, + "num_tokens": 1576872.0, + "step": 560 + }, + { + "epoch": 1.3498190591073582, + "eval_entropy": 0.6217049782195788, + "eval_loss": 0.579742431640625, + "eval_mean_token_accuracy": 0.8369964655865444, + "eval_num_tokens": 1576872.0, + "eval_runtime": 55.2863, + "eval_samples_per_second": 25.684, + "eval_steps_per_second": 3.22, + "step": 560 + }, + { + "entropy": 0.5980370678007603, + "epoch": 1.3980699638118215, + "grad_norm": 0.5197780132293701, + "learning_rate": 0.00023602554525643677, + "loss": 0.5522702217102051, + "mean_token_accuracy": 0.8421810269355774, + "num_tokens": 1627009.0, + "step": 580 + }, + { + "epoch": 1.3980699638118215, + "eval_entropy": 0.6227023576417666, + "eval_loss": 0.5810565948486328, + "eval_mean_token_accuracy": 0.838038090909465, + "eval_num_tokens": 1627009.0, + "eval_runtime": 55.2757, + "eval_samples_per_second": 25.689, + "eval_steps_per_second": 3.22, + "step": 580 + }, + { + "entropy": 0.6068567186594009, + "epoch": 1.4463208685162847, + "grad_norm": 0.5295549035072327, + "learning_rate": 0.00023573464923591205, + "loss": 0.5554513931274414, + "mean_token_accuracy": 0.8414245262742043, + "num_tokens": 1682323.0, + "step": 600 + }, + { + "epoch": 1.4463208685162847, + "eval_entropy": 0.6000609700934271, + "eval_loss": 0.5768566131591797, + "eval_mean_token_accuracy": 0.839005763611097, + "eval_num_tokens": 1682323.0, + "eval_runtime": 55.2665, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 600 + }, + { + "entropy": 0.5905646570026875, + "epoch": 1.494571773220748, + "grad_norm": 0.5507094264030457, + "learning_rate": 0.00023541059869263081, + "loss": 0.5510271549224853, + "mean_token_accuracy": 0.8407903507351875, + "num_tokens": 1738743.0, + "step": 620 + }, + { + "epoch": 1.494571773220748, + "eval_entropy": 0.6138454079627991, + "eval_loss": 0.5707022547721863, + "eval_mean_token_accuracy": 0.8407475007384011, + "eval_num_tokens": 1738743.0, + "eval_runtime": 55.2612, + "eval_samples_per_second": 25.696, + "eval_steps_per_second": 3.221, + "step": 620 + }, + { + "entropy": 0.5900103107094765, + "epoch": 1.5428226779252112, + "grad_norm": 0.5121804475784302, + "learning_rate": 0.00023505348532906368, + "loss": 0.5467266082763672, + "mean_token_accuracy": 0.8411859899759293, + "num_tokens": 1794501.0, + "step": 640 + }, + { + "epoch": 1.5428226779252112, + "eval_entropy": 0.6089411131786496, + "eval_loss": 0.5724870562553406, + "eval_mean_token_accuracy": 0.8386286386613095, + "eval_num_tokens": 1794501.0, + "eval_runtime": 55.265, + "eval_samples_per_second": 25.694, + "eval_steps_per_second": 3.221, + "step": 640 + }, + { + "entropy": 0.5756346069276332, + "epoch": 1.5910735826296745, + "grad_norm": 0.5472006797790527, + "learning_rate": 0.00023466341020406828, + "loss": 0.5396484375, + "mean_token_accuracy": 0.8448511779308319, + "num_tokens": 1850301.0, + "step": 660 + }, + { + "epoch": 1.5910735826296745, + "eval_entropy": 0.5886324905277638, + "eval_loss": 0.5662708282470703, + "eval_mean_token_accuracy": 0.8412756119551283, + "eval_num_tokens": 1850301.0, + "eval_runtime": 55.2744, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 660 + }, + { + "entropy": 0.5833481803536416, + "epoch": 1.6393244873341375, + "grad_norm": 0.47462818026542664, + "learning_rate": 0.0002342404837042908, + "loss": 0.5445642471313477, + "mean_token_accuracy": 0.8443691149353981, + "num_tokens": 1905390.0, + "step": 680 + }, + { + "epoch": 1.6393244873341375, + "eval_entropy": 0.5821995529231061, + "eval_loss": 0.5642583966255188, + "eval_mean_token_accuracy": 0.8412673570466845, + "eval_num_tokens": 1905390.0, + "eval_runtime": 55.2999, + "eval_samples_per_second": 25.678, + "eval_steps_per_second": 3.219, + "step": 680 + }, + { + "entropy": 0.5866442531347275, + "epoch": 1.6875753920386007, + "grad_norm": 0.4118373990058899, + "learning_rate": 0.00023378482551292802, + "loss": 0.5519282341003418, + "mean_token_accuracy": 0.8430693671107292, + "num_tokens": 1963425.0, + "step": 700 + }, + { + "epoch": 1.6875753920386007, + "eval_entropy": 0.5955309136195129, + "eval_loss": 0.5613821744918823, + "eval_mean_token_accuracy": 0.8411754523770193, + "eval_num_tokens": 1963425.0, + "eval_runtime": 55.2957, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 700 + }, + { + "entropy": 0.5837626487016678, + "epoch": 1.7358262967430638, + "grad_norm": 0.41595396399497986, + "learning_rate": 0.00023329656457585815, + "loss": 0.5490932464599609, + "mean_token_accuracy": 0.8433916479349136, + "num_tokens": 2024685.0, + "step": 720 + }, + { + "epoch": 1.7358262967430638, + "eval_entropy": 0.5821482105536407, + "eval_loss": 0.558079719543457, + "eval_mean_token_accuracy": 0.8422878351104394, + "eval_num_tokens": 2024685.0, + "eval_runtime": 55.2982, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 720 + }, + { + "entropy": 0.5775028631091118, + "epoch": 1.784077201447527, + "grad_norm": 0.5807880759239197, + "learning_rate": 0.00023277583906515078, + "loss": 0.5457483768463135, + "mean_token_accuracy": 0.8433813333511353, + "num_tokens": 2083327.0, + "step": 740 + }, + { + "epoch": 1.784077201447527, + "eval_entropy": 0.5907773069116506, + "eval_loss": 0.5575461387634277, + "eval_mean_token_accuracy": 0.8427480385544595, + "eval_num_tokens": 2083327.0, + "eval_runtime": 55.2977, + "eval_samples_per_second": 25.679, + "eval_steps_per_second": 3.219, + "step": 740 + }, + { + "entropy": 0.5845035955309867, + "epoch": 1.8323281061519903, + "grad_norm": 0.7579953670501709, + "learning_rate": 0.0002322227963399659, + "loss": 0.534868860244751, + "mean_token_accuracy": 0.8444704949855805, + "num_tokens": 2136500.0, + "step": 760 + }, + { + "epoch": 1.8323281061519903, + "eval_entropy": 0.5573678276177203, + "eval_loss": 0.5549466609954834, + "eval_mean_token_accuracy": 0.8444588984666246, + "eval_num_tokens": 2136500.0, + "eval_runtime": 55.2935, + "eval_samples_per_second": 25.681, + "eval_steps_per_second": 3.219, + "step": 760 + }, + { + "entropy": 0.5598218090832233, + "epoch": 1.8805790108564535, + "grad_norm": 0.48678866028785706, + "learning_rate": 0.00023163759290485277, + "loss": 0.5248189449310303, + "mean_token_accuracy": 0.8493345126509666, + "num_tokens": 2192762.0, + "step": 780 + }, + { + "epoch": 1.8805790108564535, + "eval_entropy": 0.571697450923116, + "eval_loss": 0.553294837474823, + "eval_mean_token_accuracy": 0.8445497747887386, + "eval_num_tokens": 2192762.0, + "eval_runtime": 55.3123, + "eval_samples_per_second": 25.672, + "eval_steps_per_second": 3.218, + "step": 780 + }, + { + "entropy": 0.5655641779303551, + "epoch": 1.9288299155609168, + "grad_norm": 0.46138879656791687, + "learning_rate": 0.0002310203943654614, + "loss": 0.5277577877044678, + "mean_token_accuracy": 0.8480132848024369, + "num_tokens": 2249198.0, + "step": 800 + }, + { + "epoch": 1.9288299155609168, + "eval_entropy": 0.5671831344285708, + "eval_loss": 0.549659788608551, + "eval_mean_token_accuracy": 0.8456521740790164, + "eval_num_tokens": 2249198.0, + "eval_runtime": 55.2817, + "eval_samples_per_second": 25.687, + "eval_steps_per_second": 3.22, + "step": 800 + }, + { + "entropy": 0.5706607647240162, + "epoch": 1.97708082026538, + "grad_norm": 0.5946080088615417, + "learning_rate": 0.00023037137538167756, + "loss": 0.5285571098327637, + "mean_token_accuracy": 0.8492968618869782, + "num_tokens": 2302801.0, + "step": 820 + }, + { + "epoch": 1.97708082026538, + "eval_entropy": 0.5583421492509628, + "eval_loss": 0.5508614778518677, + "eval_mean_token_accuracy": 0.8449724641408813, + "eval_num_tokens": 2302801.0, + "eval_runtime": 55.2922, + "eval_samples_per_second": 25.682, + "eval_steps_per_second": 3.219, + "step": 820 + }, + { + "entropy": 0.5352686597750738, + "epoch": 2.0241254523522314, + "grad_norm": 0.517436146736145, + "learning_rate": 0.00022969071961819653, + "loss": 0.4967633247375488, + "mean_token_accuracy": 0.8539289052669818, + "num_tokens": 2358124.0, + "step": 840 + }, + { + "epoch": 2.0241254523522314, + "eval_entropy": 0.5460858092214285, + "eval_loss": 0.5541515946388245, + "eval_mean_token_accuracy": 0.8444636634896311, + "eval_num_tokens": 2358124.0, + "eval_runtime": 55.2792, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 840 + }, + { + "entropy": 0.5349089443683624, + "epoch": 2.0723763570566947, + "grad_norm": 0.48389914631843567, + "learning_rate": 0.00022897861969254802, + "loss": 0.4880528450012207, + "mean_token_accuracy": 0.8565173164010048, + "num_tokens": 2415855.0, + "step": 860 + }, + { + "epoch": 2.0723763570566947, + "eval_entropy": 0.5478990998830688, + "eval_loss": 0.5504991412162781, + "eval_mean_token_accuracy": 0.8451762336693452, + "eval_num_tokens": 2415855.0, + "eval_runtime": 55.274, + "eval_samples_per_second": 25.69, + "eval_steps_per_second": 3.22, + "step": 860 + }, + { + "entropy": 0.5228772208094596, + "epoch": 2.120627261761158, + "grad_norm": 0.6992365121841431, + "learning_rate": 0.00022823527712058763, + "loss": 0.47493491172790525, + "mean_token_accuracy": 0.8592865198850632, + "num_tokens": 2470986.0, + "step": 880 + }, + { + "epoch": 2.120627261761158, + "eval_entropy": 0.5613854529147737, + "eval_loss": 0.5500661730766296, + "eval_mean_token_accuracy": 0.8454208903098375, + "eval_num_tokens": 2470986.0, + "eval_runtime": 55.2832, + "eval_samples_per_second": 25.686, + "eval_steps_per_second": 3.22, + "step": 880 + }, + { + "entropy": 0.5406626127660275, + "epoch": 2.168878166465621, + "grad_norm": 0.5532678365707397, + "learning_rate": 0.00022746090225947036, + "loss": 0.49279079437255857, + "mean_token_accuracy": 0.8531969726085663, + "num_tokens": 2531414.0, + "step": 900 + }, + { + "epoch": 2.168878166465621, + "eval_entropy": 0.5460681235522367, + "eval_loss": 0.5487214922904968, + "eval_mean_token_accuracy": 0.846246014820056, + "eval_num_tokens": 2531414.0, + "eval_runtime": 55.2597, + "eval_samples_per_second": 25.697, + "eval_steps_per_second": 3.221, + "step": 900 + }, + { + "entropy": 0.5183610402047634, + "epoch": 2.2171290711700844, + "grad_norm": 0.4878793954849243, + "learning_rate": 0.0002266557142481219, + "loss": 0.47671008110046387, + "mean_token_accuracy": 0.8574348524212837, + "num_tokens": 2593991.0, + "step": 920 + }, + { + "epoch": 2.2171290711700844, + "eval_entropy": 0.5519453509804908, + "eval_loss": 0.5485875606536865, + "eval_mean_token_accuracy": 0.8460252378763777, + "eval_num_tokens": 2593991.0, + "eval_runtime": 55.2689, + "eval_samples_per_second": 25.693, + "eval_steps_per_second": 3.221, + "step": 920 + }, + { + "entropy": 0.5451609842479229, + "epoch": 2.2653799758745476, + "grad_norm": 0.48667117953300476, + "learning_rate": 0.00022581994094522502, + "loss": 0.4859492301940918, + "mean_token_accuracy": 0.8560267508029937, + "num_tokens": 2643389.0, + "step": 940 + }, + { + "epoch": 2.2653799758745476, + "eval_entropy": 0.535621813341473, + "eval_loss": 0.5473203063011169, + "eval_mean_token_accuracy": 0.8467726958601662, + "eval_num_tokens": 2643389.0, + "eval_runtime": 55.2696, + "eval_samples_per_second": 25.692, + "eval_steps_per_second": 3.221, + "step": 940 + }, + { + "entropy": 0.5411449268460273, + "epoch": 2.313630880579011, + "grad_norm": 0.415181964635849, + "learning_rate": 0.0002249538188647382, + "loss": 0.49844727516174314, + "mean_token_accuracy": 0.8529795065522194, + "num_tokens": 2697757.0, + "step": 960 + }, + { + "epoch": 2.313630880579011, + "eval_entropy": 0.5490157793412048, + "eval_loss": 0.5466434955596924, + "eval_mean_token_accuracy": 0.8461743238266934, + "eval_num_tokens": 2697757.0, + "eval_runtime": 55.2795, + "eval_samples_per_second": 25.688, + "eval_steps_per_second": 3.22, + "step": 960 + }, + { + "entropy": 0.5334192402660847, + "epoch": 2.361881785283474, + "grad_norm": 0.4685683250427246, + "learning_rate": 0.000224057593108965, + "loss": 0.4855259895324707, + "mean_token_accuracy": 0.8572103619575501, + "num_tokens": 2752288.0, + "step": 980 + }, + { + "epoch": 2.361881785283474, + "eval_entropy": 0.5256538374370403, + "eval_loss": 0.5443126559257507, + "eval_mean_token_accuracy": 0.8462888333904609, + "eval_num_tokens": 2752288.0, + "eval_runtime": 55.2968, + "eval_samples_per_second": 25.68, + "eval_steps_per_second": 3.219, + "step": 980 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.4843883414692352e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-100/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-100/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-100/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-100/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1000/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1000/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1020/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1020/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1020/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1020/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1020/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1020/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1040/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1040/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1040/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1040/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1040/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1040/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1060/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1060/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1060/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1060/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1060/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1060/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1080/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1080/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1080/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1080/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1080/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1080/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1100/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1100/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1100/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1100/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1120/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1120/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1120/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1120/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1120/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1120/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1140/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1140/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1140/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1140/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1140/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1140/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1160/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1160/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1160/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1160/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1160/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1160/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1180/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1180/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1180/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1180/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1180/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1180/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-120/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-120/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-120/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-120/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-120/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-120/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1200/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1200/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1220/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1220/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1220/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1220/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1220/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1220/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1240/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1240/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1240/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1240/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1240/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1240/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1260/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1260/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1260/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1260/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1260/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1260/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1280/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1280/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1280/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1280/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1280/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1280/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1300/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1300/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1300/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1300/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1320/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1320/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1320/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1320/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1320/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1320/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1340/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1340/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1340/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1340/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1340/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1340/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1360/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1360/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1360/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1360/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1360/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1360/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1380/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1380/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1380/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1380/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1380/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1380/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-140/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-140/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-140/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-140/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-140/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-140/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1400/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1400/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1420/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1420/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1420/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1420/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1420/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1420/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1440/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1440/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1440/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1440/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1440/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1440/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1460/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1460/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1460/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1460/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1460/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1460/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1480/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1480/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1480/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1480/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1480/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1480/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1500/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1500/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1520/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1520/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1520/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1520/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1520/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1520/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1540/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1540/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1540/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1540/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1540/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1540/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1560/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1560/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1560/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1560/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1560/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80 --- /dev/null +++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1560/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696 +size 6033 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1266/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1266/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b0c90d4dbe31bb6a72678e7a58829936e1617019 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1266/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbc0465e8a3e31834d5581e6d5ad0111152eeeff8261ecf97477bc6560ad49a0 +size 1057033224 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1266/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1266/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1266/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1266/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1266/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..99efd2efd190923f6b26ead2efb08510045ea154 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1266/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95177719630e08936790f1adeea112eef415eded432f4d94d97b1825873aee7c +size 5969 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1688/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1688/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5bda4f8538992e387e081475628655eb37f197fb --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1688/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98f955c8b28f525019d2314875439600a23046153ef0406f189759423adb2369 +size 1057033224 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1688/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1688/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1688/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1688/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1688/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..99efd2efd190923f6b26ead2efb08510045ea154 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1688/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95177719630e08936790f1adeea112eef415eded432f4d94d97b1825873aee7c +size 5969 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2110/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2110/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3beac069785c50f17db47944a931cb0bf16039cf --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2110/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2da87972606715aa14afa74bbb643282759cd29a58c8daf1d99c5c2a428eb740 +size 1057033224 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2110/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2110/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2110/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2110/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2110/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..99efd2efd190923f6b26ead2efb08510045ea154 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2110/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95177719630e08936790f1adeea112eef415eded432f4d94d97b1825873aee7c +size 5969 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2532/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2532/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a12eba43304212f561a144c6f9474d5cf17f83ca --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2532/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2e4095c03117a779057d4dad387e1f64fde6c43b0dc6703dc47091a6dad4984 +size 1057033224 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2532/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2532/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2532/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2532/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2532/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..99efd2efd190923f6b26ead2efb08510045ea154 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2532/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95177719630e08936790f1adeea112eef415eded432f4d94d97b1825873aee7c +size 5969 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2954/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2954/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2bed5e406632ba311db684d4f21f1b9f9dac942d --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2954/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df77e14f8eae9e8919861d0bd3fe77c7a7c2801b84cef67a910a8c9d4d5a3f9a +size 1057033224 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2954/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2954/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2954/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2954/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2954/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..99efd2efd190923f6b26ead2efb08510045ea154 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2954/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95177719630e08936790f1adeea112eef415eded432f4d94d97b1825873aee7c +size 5969 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3376/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3376/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8f3ef2251dcf97eb864a974a0a186c4fbdf77439 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3376/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ac17bf5a18a3ab69e42da7dcb73ad094de4c08046a089e3ab3b043bfea9a7a2 +size 1057033224 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3376/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3376/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3376/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3376/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3376/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..99efd2efd190923f6b26ead2efb08510045ea154 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3376/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95177719630e08936790f1adeea112eef415eded432f4d94d97b1825873aee7c +size 5969 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3798/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3798/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4722b7dc87a71d46cacffc47b9b4f6a6ce6b5f82 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3798/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e2ff0278a7e4a91bc38e5aa28c47097a32de7a907d9019095409330a4bfaaf4 +size 1057033224 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3798/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3798/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3798/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3798/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3798/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..99efd2efd190923f6b26ead2efb08510045ea154 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3798/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95177719630e08936790f1adeea112eef415eded432f4d94d97b1825873aee7c +size 5969 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-422/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-422/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2ea15347bed9dd0961cb18242b8e9db86f300718 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-422/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02b485400e995a42cb23ef66e33c7a12b156d4b4374e4aeac310474ae875c846 +size 1057033224 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-422/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-422/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-422/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-422/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-422/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..99efd2efd190923f6b26ead2efb08510045ea154 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-422/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95177719630e08936790f1adeea112eef415eded432f4d94d97b1825873aee7c +size 5969 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4220/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4220/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..669f07e27c1a2a2ce525bfc9b0ec6ce399987e88 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4220/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb6dfd27d1eb005e985b111f3fb3242204aeee1ce3ea2dfb79de157e5dc509cc +size 1057033224 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4220/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4220/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4220/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4220/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4220/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..99efd2efd190923f6b26ead2efb08510045ea154 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4220/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95177719630e08936790f1adeea112eef415eded432f4d94d97b1825873aee7c +size 5969 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-844/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-844/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ed21afcb8026f0f385c8f859fcaf9f6af359532d --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-844/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f6cba1f2575e88bbf364498e5cf0dd15841b60860b350dad3aef4f9be3a64cf +size 1057033224 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-844/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-844/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-844/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-844/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-844/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..99efd2efd190923f6b26ead2efb08510045ea154 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-844/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95177719630e08936790f1adeea112eef415eded432f4d94d97b1825873aee7c +size 5969 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1266/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1266/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b98227c79a177ecbd1ceaae2a71dde1c990af409 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1266/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f2f39d2ae0f5e794676bdddadd58765bc0813b4069e515cb2476aa18eba4996 +size 1057033224 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1266/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1266/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1266/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1266/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1266/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..20ebe0d32e9bd2cc668073dacd5148ca89c78625 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1266/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0df2bed10826027e1bb1376eeab6208c1e472c43974a3c94fd2b666987533ffb +size 5969 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1688/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1688/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1ad1f555b029ef4c1e0a6d9335a17d3170592384 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1688/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4804e08ee28a11ff0b5d7b95733e4feb359a6801a1d9bd972df3f8c506c3f37 +size 1057033224 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1688/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1688/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1688/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1688/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1688/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..20ebe0d32e9bd2cc668073dacd5148ca89c78625 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1688/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0df2bed10826027e1bb1376eeab6208c1e472c43974a3c94fd2b666987533ffb +size 5969 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2110/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2110/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..944de5a919a1cae75daafaab5d0a8b978c19a711 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2110/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c77c26855b7a49b60e247e7a3ecb8e3d704b197262872f15c9facbca0cb227b1 +size 1057033224 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2110/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2110/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2110/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2110/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2110/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..20ebe0d32e9bd2cc668073dacd5148ca89c78625 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2110/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0df2bed10826027e1bb1376eeab6208c1e472c43974a3c94fd2b666987533ffb +size 5969 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2532/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2532/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f742b23bacd8f72d8be841693b1b1df3ff318b12 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2532/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c401c5fba74635a8b3a64d2ec7f449afcbf1b7b94689c7cc2ecba9f1f636d241 +size 1057033224 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2532/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2532/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2532/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2532/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2532/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..20ebe0d32e9bd2cc668073dacd5148ca89c78625 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2532/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0df2bed10826027e1bb1376eeab6208c1e472c43974a3c94fd2b666987533ffb +size 5969 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2954/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2954/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4dc51e5628ae4eb01a4f0ec237ea51060ca97c88 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2954/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3af25f716a51726569c06809284291bd27d42db4003364d44bebbf3a1ce55342 +size 1057033224 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2954/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2954/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2954/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2954/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2954/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..20ebe0d32e9bd2cc668073dacd5148ca89c78625 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2954/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0df2bed10826027e1bb1376eeab6208c1e472c43974a3c94fd2b666987533ffb +size 5969 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3376/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3376/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5232ac8f61379fa08a0714376988404528e95906 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3376/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbef709b106a321383a66ec14cc970cdf7eb2b7b748bfe3905e31524c09a4eef +size 1057033224 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3376/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3376/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3376/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3376/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3376/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..20ebe0d32e9bd2cc668073dacd5148ca89c78625 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3376/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0df2bed10826027e1bb1376eeab6208c1e472c43974a3c94fd2b666987533ffb +size 5969 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3798/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3798/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7ff63a320f84be99df499f50274ce5f2c81b3534 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3798/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c54cf6e72e7c873d53b7617c915432ac66f7c63777f889def5c57bfbf1e6a074 +size 1057033224 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3798/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3798/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3798/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3798/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3798/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..20ebe0d32e9bd2cc668073dacd5148ca89c78625 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3798/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0df2bed10826027e1bb1376eeab6208c1e472c43974a3c94fd2b666987533ffb +size 5969 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-422/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-422/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..12ec2005856a8c953b1e6f82e4ed2a5a0717649e --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-422/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7311f2d0bb6d2db2650faf6cc04b7562a8391ee347739da749527b56b04eaf99 +size 1057033224 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-422/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-422/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-422/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-422/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-422/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..20ebe0d32e9bd2cc668073dacd5148ca89c78625 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-422/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0df2bed10826027e1bb1376eeab6208c1e472c43974a3c94fd2b666987533ffb +size 5969 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4220/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4220/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8d85ba9879984d9921806478f46c82e3138f0c2d --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4220/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09621bae85f1c6494b2b3b15c3b968315f14bc41e256567b729af19c7e3f5cb6 +size 1057033224 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4220/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4220/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4220/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4220/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4220/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..20ebe0d32e9bd2cc668073dacd5148ca89c78625 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4220/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0df2bed10826027e1bb1376eeab6208c1e472c43974a3c94fd2b666987533ffb +size 5969 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-844/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-844/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0bf5570f0cd7ca65bac64058da3a08cb28823e82 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-844/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbc05d8fe4ac51af63319fef49b05f2bb4f32a2a8108268d5425a7588c7175e3 +size 1057033224 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-844/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-844/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-844/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-844/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-844/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..20ebe0d32e9bd2cc668073dacd5148ca89c78625 --- /dev/null +++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-844/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0df2bed10826027e1bb1376eeab6208c1e472c43974a3c94fd2b666987533ffb +size 5969 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1248/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1248/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ce428b1631fad3e767ca430b23d1ef43b10e3864 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1248/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:486dd29b5f6b93bc43f1b9af9e7ce4c81dffa071f4f1f907f7a0b40c2545430a +size 132187888 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1248/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1248/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1248/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1248/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1248/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3fafc70b589d6e0520076fdac27e04be46598a54 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1248/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da0a7d9c0cf6292257087e32deb7b0d08b81271ded4a2b606e806480fba0a89f +size 5969 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1664/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1664/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1f3a322cc4eb0510ff5cb9385d917b76db1fdbac --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1664/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5d17e8a5e5422b0e365731f316a518dc79694bf3de26efd8b296dbbc2b69303 +size 132187888 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1664/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1664/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1664/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1664/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1664/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3fafc70b589d6e0520076fdac27e04be46598a54 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1664/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da0a7d9c0cf6292257087e32deb7b0d08b81271ded4a2b606e806480fba0a89f +size 5969 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2080/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2080/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..db3ee55611341a6ba978d64f2a898efe9ef997c3 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2080/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3033dcdcb0e9135c8bca97563001e0cd476b2a6874e7dd3ff1c0e87890bbeef2 +size 132187888 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2080/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2080/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2080/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2080/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2080/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3fafc70b589d6e0520076fdac27e04be46598a54 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2080/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da0a7d9c0cf6292257087e32deb7b0d08b81271ded4a2b606e806480fba0a89f +size 5969 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2496/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2496/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..27e5bcdb1df00b2d6aa40ebf240a0526990af029 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2496/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b268ec0ab1ded0b1cf807843d427c8ca1d5a58f99923a1fb2a0f19be90df959 +size 132187888 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2496/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2496/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2496/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2496/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2496/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3fafc70b589d6e0520076fdac27e04be46598a54 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2496/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da0a7d9c0cf6292257087e32deb7b0d08b81271ded4a2b606e806480fba0a89f +size 5969 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2912/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2912/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8342c0cd7b8ce0631547c5665e27e479b6fb7427 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2912/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b2247313954fdcc3dc52da901c91df46323a60acd30537e46732ac047a8d16c +size 132187888 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2912/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2912/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2912/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2912/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2912/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3fafc70b589d6e0520076fdac27e04be46598a54 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2912/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da0a7d9c0cf6292257087e32deb7b0d08b81271ded4a2b606e806480fba0a89f +size 5969 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3328/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3328/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a7b7d0b4134f7667b3c6860707ab93b3c361ea9b --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3328/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b4893e6f0d53e4fbbd76835ae9175a4db7d918e3f80d8ca44eb542f184a7a40 +size 132187888 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3328/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3328/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3328/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3328/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3328/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3fafc70b589d6e0520076fdac27e04be46598a54 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3328/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da0a7d9c0cf6292257087e32deb7b0d08b81271ded4a2b606e806480fba0a89f +size 5969 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3744/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3744/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..294b11b71a4b9bc2166046d8fa375b8f26e74353 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3744/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2abe2c724d7af7d15cc652e2fe530354df64dde50c0ccadd9ea4b52ec955b403 +size 132187888 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3744/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3744/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3744/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3744/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3744/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3fafc70b589d6e0520076fdac27e04be46598a54 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3744/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da0a7d9c0cf6292257087e32deb7b0d08b81271ded4a2b606e806480fba0a89f +size 5969 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-416/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-416/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7d7163298fb9416526777e6474aa70d9b41196e5 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-416/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5baeb8ce8698b4f981f6cd28c7d7b942e750f35955dcff00f87d8ea1b95c585d +size 132187888 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-416/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-416/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-416/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-416/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-416/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3fafc70b589d6e0520076fdac27e04be46598a54 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-416/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da0a7d9c0cf6292257087e32deb7b0d08b81271ded4a2b606e806480fba0a89f +size 5969 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4160/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4160/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a9cd7f5f9852409c134823588e5106c68f5fa8ca --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4160/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee2cea8460aa29bb1d0932624b31c8e7abd00416e2208ae9eb5aea1960598d24 +size 132187888 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4160/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4160/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4160/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4160/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4160/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3fafc70b589d6e0520076fdac27e04be46598a54 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4160/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da0a7d9c0cf6292257087e32deb7b0d08b81271ded4a2b606e806480fba0a89f +size 5969 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-832/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-832/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..44efa3031afe66ca6b0d9c82ac658f6780cab2e1 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-832/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d5e28929beff3fa99ab1fb1ac093702e413931937da1b2d6d0c878db5c41344 +size 132187888 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-832/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-832/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-832/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-832/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-832/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3fafc70b589d6e0520076fdac27e04be46598a54 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-832/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da0a7d9c0cf6292257087e32deb7b0d08b81271ded4a2b606e806480fba0a89f +size 5969 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1248/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1248/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9cfa2a6ff3ce788cf9c12c4afb88b9ca2c94e46c --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1248/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b7e4987b1586d1a55ed2276403a9b869985bf94f8939bd6dea92f7bd2605213 +size 528550256 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1248/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1248/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1248/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1248/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1248/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a440f50784e9505c9d6e4a015cefc759796ec4d7 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1248/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba3dd5808b5910ef225f7fefb4fa29294ee03c7397c1856f7855760414821deb +size 5969 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1664/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1664/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..399f317e9848f7b5acb368616ebc3e99d6a47eae --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1664/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8b8fa125ed5440c1168e74ce33d492de5d53155fb6cb4d1f6d649b4f24820ba +size 528550256 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1664/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1664/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1664/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1664/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1664/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a440f50784e9505c9d6e4a015cefc759796ec4d7 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1664/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba3dd5808b5910ef225f7fefb4fa29294ee03c7397c1856f7855760414821deb +size 5969 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2080/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2080/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0d3c8b937722192299536f8f04a20c3c7f68fd72 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2080/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:833f2c0917396b417eacef3cacfeca122695b4e007faf162a6afb68d9e386b6d +size 528550256 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2080/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2080/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2080/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2080/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2080/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a440f50784e9505c9d6e4a015cefc759796ec4d7 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2080/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba3dd5808b5910ef225f7fefb4fa29294ee03c7397c1856f7855760414821deb +size 5969 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2496/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2496/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f758a4f542b72427552557b7da496b6ef5756dec --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2496/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:121f4441d5798e30d8e650d939fa4da776bc46931d49b9447c93cb2caafaf8bf +size 528550256 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2496/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2496/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2496/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2496/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2496/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a440f50784e9505c9d6e4a015cefc759796ec4d7 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2496/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba3dd5808b5910ef225f7fefb4fa29294ee03c7397c1856f7855760414821deb +size 5969 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2912/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2912/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..439786ef2da27e0e9b15c465aa14617be56782e5 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2912/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfefb8f1aee9a9f521cb74126896324b6b2afde5ebb138ca525d152c983922e2 +size 528550256 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2912/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2912/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2912/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2912/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2912/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a440f50784e9505c9d6e4a015cefc759796ec4d7 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2912/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba3dd5808b5910ef225f7fefb4fa29294ee03c7397c1856f7855760414821deb +size 5969 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3328/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3328/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8840f8ee659fa7ec7889a6813bdcc0abf126654b --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3328/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a37dbf1e87deb6eacd3a4623e018cc0e710b7c195ab6a30244f3622ebecd0f4a +size 528550256 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3328/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3328/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3328/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3328/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3328/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a440f50784e9505c9d6e4a015cefc759796ec4d7 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3328/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba3dd5808b5910ef225f7fefb4fa29294ee03c7397c1856f7855760414821deb +size 5969 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3744/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3744/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fe187f7bf47b0e7ceb93f608f0fe1cde05dba5c8 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3744/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c2f4653542268dfc392b93cc5750cc210e0f330fae012b9c28d96661c295a66 +size 528550256 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3744/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3744/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3744/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3744/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3744/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a440f50784e9505c9d6e4a015cefc759796ec4d7 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3744/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba3dd5808b5910ef225f7fefb4fa29294ee03c7397c1856f7855760414821deb +size 5969 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-416/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-416/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1f48becf1d9f6c9a810560f2a41e354e24fba91b --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-416/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ed6a3a449129e9cf5dde5daee7bbd7c69fb87e22052ca65164937249e0bdb7b +size 528550256 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-416/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-416/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-416/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-416/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-416/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a440f50784e9505c9d6e4a015cefc759796ec4d7 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-416/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba3dd5808b5910ef225f7fefb4fa29294ee03c7397c1856f7855760414821deb +size 5969 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4160/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4160/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..54c585e94330c0d02094878cbc1a87538dc3df7d --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4160/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43a7fd8a735846be654071534beccd5cc59a8b91db968965935159995596b259 +size 528550256 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4160/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4160/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4160/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4160/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4160/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a440f50784e9505c9d6e4a015cefc759796ec4d7 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4160/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba3dd5808b5910ef225f7fefb4fa29294ee03c7397c1856f7855760414821deb +size 5969 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-832/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-832/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9384b871b0f4e27005ba84dcc562dae49e931da5 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-832/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2aa468c76942d264d6265b2d7518c5460f4a63f0e0191383485a46672021d23 +size 528550256 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-832/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-832/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-832/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-832/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-832/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a440f50784e9505c9d6e4a015cefc759796ec4d7 --- /dev/null +++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-832/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba3dd5808b5910ef225f7fefb4fa29294ee03c7397c1856f7855760414821deb +size 5969 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1266/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1266/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8a3ef83e45551fd44e02c30a466fa4d16b6d3081 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1266/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a305489717ffb7f09694d37800511fc8105e970d422da845485beacda648e805 +size 528550256 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1266/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1266/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1266/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1266/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1266/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6a14a967df058442729af6156a5dd1378874b039 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1266/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bd6716430261876870f9d8a9a13c030ff118c36e6dde661af5760e2fb3ae358 +size 6033 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1688/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1688/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3c6c5929b165d6927d7fb9f203e47dddc2d9e39d --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1688/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed3ca7e462a6f9dda89543c44e3df8eb6630e9c9a8810febe416f1b53c6647cb +size 528550256 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1688/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1688/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1688/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1688/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1688/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6a14a967df058442729af6156a5dd1378874b039 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1688/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bd6716430261876870f9d8a9a13c030ff118c36e6dde661af5760e2fb3ae358 +size 6033 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2110/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2110/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..80a999bf5498d0528936e4386d47dfac9d9e86c7 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2110/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:871529db5fc0121f990b9ba5cb1b81d4bd577f9777c45a7425aa3f5ce97e22d2 +size 528550256 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2110/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2110/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2110/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2110/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2110/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6a14a967df058442729af6156a5dd1378874b039 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2110/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bd6716430261876870f9d8a9a13c030ff118c36e6dde661af5760e2fb3ae358 +size 6033 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2532/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2532/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..59a6aad946a782e9cf92d5d31edc7de20ac66052 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2532/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13b70c2d156f9524059ddf23c74ab1c8fde7f7989e916d2c9dab9896a858021e +size 528550256 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2532/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2532/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2532/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2532/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2532/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6a14a967df058442729af6156a5dd1378874b039 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2532/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bd6716430261876870f9d8a9a13c030ff118c36e6dde661af5760e2fb3ae358 +size 6033 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2954/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2954/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4a4880529704a423c0317f524419250a207f39ee --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2954/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b5410a1e2bfe1f4f1b1285f558381e41ed44c5270eac59d2a712747a7e674e9 +size 528550256 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2954/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2954/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2954/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2954/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2954/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6a14a967df058442729af6156a5dd1378874b039 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2954/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bd6716430261876870f9d8a9a13c030ff118c36e6dde661af5760e2fb3ae358 +size 6033 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3376/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3376/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..98b34d771c7d1a01201d034df2531109334e3730 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3376/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de14455a0c67a5b03c277d24e9cf24137c1e5d163329d0de56c0073cc7824d19 +size 528550256 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3376/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3376/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3376/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3376/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3376/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6a14a967df058442729af6156a5dd1378874b039 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3376/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bd6716430261876870f9d8a9a13c030ff118c36e6dde661af5760e2fb3ae358 +size 6033 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3798/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3798/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2adee9fff9a2d2bdd621c45ef01b49834f23169f --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3798/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1b86e6e53406383857bceb893a71ab43d661371de4cab6c55af683ecd39d614 +size 528550256 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3798/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3798/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3798/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3798/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3798/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6a14a967df058442729af6156a5dd1378874b039 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3798/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bd6716430261876870f9d8a9a13c030ff118c36e6dde661af5760e2fb3ae358 +size 6033 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-422/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-422/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6d303f053aa87b837b1f54ebce0a95f982ead023 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-422/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b2fb11013528d4f89f479d0314835588ce1134e07a7c0ddf5969097d854685c +size 528550256 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-422/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-422/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-422/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-422/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-422/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6a14a967df058442729af6156a5dd1378874b039 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-422/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bd6716430261876870f9d8a9a13c030ff118c36e6dde661af5760e2fb3ae358 +size 6033 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4220/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4220/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..69641f99c90f4d3650c29d4ea9157c96d1379445 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4220/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff2b37f0486dc225ee19bb58233340c01582cbe6a0e6b757c92fda732d488340 +size 528550256 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4220/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4220/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4220/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4220/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4220/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6a14a967df058442729af6156a5dd1378874b039 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4220/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bd6716430261876870f9d8a9a13c030ff118c36e6dde661af5760e2fb3ae358 +size 6033 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-844/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-844/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fcf9438fccfcef51075f1ab11261959d8954c1ca --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-844/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0df7b78b191b65ba9bfda9f8ac4e94553d23e873c77250c78c5a3a5a33304292 +size 528550256 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-844/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-844/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-844/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-844/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-844/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6a14a967df058442729af6156a5dd1378874b039 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-844/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bd6716430261876870f9d8a9a13c030ff118c36e6dde661af5760e2fb3ae358 +size 6033 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1266/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1266/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8f0ec3dc9c343a2c369c5c7897b2025dd0c3de5d --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1266/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf1b946ce8888b60617677c221d4f9345a1a90f299d8c81589a4297328f6f9cb +size 1057033224 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1688/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1688/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1891d78d680487f3fcd81617a9be4094a5dd444c --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1688/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:432d4c151b4f2e819fe75b03bac489ef4066f67461c3b62362523ff204cd5cfd +size 1057033224 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1688/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1688/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1688/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1688/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1688/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e4d1131b805c0250f1e517771932318a8cb06f57 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1688/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:239bcb1638f2f0880dabc8a510a3159ecd9462dda9028bdcd7cf5f3253b98190 +size 6033 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2110/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2110/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..415e93313a151c45f73744a8f85628bf6fd95fdc --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2110/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1769a198912e748ca0e5ea0bf306afdfd5f6f6edf49959437427c0403dba1660 +size 1057033224 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2110/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2110/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2110/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2110/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2110/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e4d1131b805c0250f1e517771932318a8cb06f57 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2110/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:239bcb1638f2f0880dabc8a510a3159ecd9462dda9028bdcd7cf5f3253b98190 +size 6033 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2532/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2532/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7d5b245889666232789df267ce760e730f79cff4 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2532/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b828533887ff4c3775d0145ce96efa3497149a3196b06745d6992bc7888ddcf +size 1057033224 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2532/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2532/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2532/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2532/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2532/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e4d1131b805c0250f1e517771932318a8cb06f57 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2532/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:239bcb1638f2f0880dabc8a510a3159ecd9462dda9028bdcd7cf5f3253b98190 +size 6033 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2954/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2954/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..64b45a7631a0d9c38948303621f42915744e5895 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2954/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:541c432be66eca44eb5090d931bb8dec7d1b5e6434922a136b5fb8b069d35d2e +size 1057033224 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2954/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2954/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2954/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2954/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2954/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e4d1131b805c0250f1e517771932318a8cb06f57 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2954/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:239bcb1638f2f0880dabc8a510a3159ecd9462dda9028bdcd7cf5f3253b98190 +size 6033 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3376/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3376/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d18863db62a6976a50722f029fa087e3bad82916 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3376/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b17b8c5ac151dc836de67b44b428c7202ac2fc49f6875f8782ce0ac0bab0141 +size 1057033224 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3376/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3376/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3376/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3376/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3376/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e4d1131b805c0250f1e517771932318a8cb06f57 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3376/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:239bcb1638f2f0880dabc8a510a3159ecd9462dda9028bdcd7cf5f3253b98190 +size 6033 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3798/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3798/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a8a8757633d0429a28785790a48928a4b1cade99 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3798/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08338e683f664fb43bd57f2f1ad9784286222c25cf329dd6287c832d35a88d2c +size 1057033224 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3798/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3798/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3798/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3798/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3798/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e4d1131b805c0250f1e517771932318a8cb06f57 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3798/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:239bcb1638f2f0880dabc8a510a3159ecd9462dda9028bdcd7cf5f3253b98190 +size 6033 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-422/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-422/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..75580e83603d3bf49d57a0a1aab20eca05467fb6 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-422/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:060618a44d1a4d11107a689b1faec187ceba439f81351ea758bbcdd01f4efe5a +size 1057033224 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-422/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-422/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-422/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-422/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-422/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e4d1131b805c0250f1e517771932318a8cb06f57 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-422/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:239bcb1638f2f0880dabc8a510a3159ecd9462dda9028bdcd7cf5f3253b98190 +size 6033 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-4220/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-4220/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..651313a886f88529351e27d6614f434288835c16 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-4220/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a93bf8441a40523d9705d5a84931485ee313190390af6a79a3de7e0a8cc6ed29 +size 1057033224 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-4220/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-4220/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-4220/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-4220/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-4220/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e4d1131b805c0250f1e517771932318a8cb06f57 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-4220/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:239bcb1638f2f0880dabc8a510a3159ecd9462dda9028bdcd7cf5f3253b98190 +size 6033 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-844/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-844/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7f5cb096f41ccfe8e574d77577146f11a1493b01 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-844/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a29c3c29d49743bfbddbe6d728f7d24d152490aa110bc32ccc026195f0957c6 +size 1057033224 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-844/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-844/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-844/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-844/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-844/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e4d1131b805c0250f1e517771932318a8cb06f57 --- /dev/null +++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-844/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:239bcb1638f2f0880dabc8a510a3159ecd9462dda9028bdcd7cf5f3253b98190 +size 6033 diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3744/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3744/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..eeb961260daab308690ee2424014d017c81948ff --- /dev/null +++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3744/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92041ff3f6c36f2c0eba07422557339fa9c5291f3c862d1c259c5a20a2b5e617 +size 132187888 diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-416/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-416/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..93f546b7ff4208301d1d56632f05d992da2f9787 --- /dev/null +++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-416/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3282ecdf3e931422bb3dcf90b3ab7c327640022c3816e8176ad272aed74bbeb2 +size 132187888 diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4160/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4160/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..98eb25e7492c2d265c9caac0cb59306a62752da5 --- /dev/null +++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4160/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fddf99881d0157cd7ce7286527f4f0b5f1497ded101b726a95b7030cc6361e0 +size 132187888 diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-832/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-832/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1c88555eda53efba8e381a14444e761377fa6557 --- /dev/null +++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-832/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dd25ec4e3e2a4ba2a47a382979cc571755290d6e004dec9681e1f5e7f790260 +size 132187888 diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1248/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1248/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1afb2db769ef6bc745ae9a969f35e592271b6853 --- /dev/null +++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1248/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df79709ef7b8e9d0108929e9d6d9e6219fc0e71dc0bf461880764bea234c2802 +size 1057033224 diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1664/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1664/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..917f9ba91cb6dc51e837459c71c41ef46d414c0d --- /dev/null +++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1664/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4028d8243e88a1ed8653bb6f1ae72125cc50df50f3c00af5503aab4a9b8b03a +size 1057033224 diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2080/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2080/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0758a013282d084b219cb1a09d6fcdd981a805ef --- /dev/null +++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2080/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e1f59a4947a3ada4ccecd49140aa5614a14c1d2a4b7a385eac2abc7979be1d7 +size 1057033224 diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2496/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2496/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a1f61f61ea41667466578cf2656c3aeabd0c94f2 --- /dev/null +++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2496/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37ad97e8609611d2493dbcaf94b0111929ac576e7faf32a2b00ab35e2d239d64 +size 1057033224 diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2912/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2912/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5946483d879dff2e6eb1e2cf4531552a299106b7 --- /dev/null +++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2912/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc9c5790f7c7e6eec0ce381b5528e905a301910027cc490f25212c1442e068f5 +size 1057033224 diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3328/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3328/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..77cbd320654f1aceba60a6be42c69eaad316b078 --- /dev/null +++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3328/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b84b12a897a4b643e8092814dbc2b91333786bcfc8c45e159d58a08aa3c67c06 +size 1057033224 diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3744/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3744/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6f43794ef309cbd9b81142cd9f4608ee3d17f4ec --- /dev/null +++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3744/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ee9e4e813a7a4f4b702573295829d04fd03f54f1b6cd2d77b31911491e2fb4d +size 1057033224 diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-416/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-416/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a3ca9f8d1072593bd648832cdd6495c4ccd28f99 --- /dev/null +++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-416/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc523dd7dac2217bdd4e0cff500f687607e9a64260ae620fed7277afe1a33bae +size 1057033224 diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-4160/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-4160/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..84e5dc7c8501227ab609efd2c72304c05137b455 --- /dev/null +++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-4160/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7647b1aa2182b24cd8ec9330cbd6107d057c584474b28660e5f8465847df3318 +size 1057033224 diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-832/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-832/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f71bd1034792b14cb52aec864e5d56320caac943 --- /dev/null +++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-832/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1e70d71bfb1a3bc125633e6457c56a6f9cf8fd76cf63d131555db20486985be +size 1057033224